diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 85276bd24bcf4..c8bd10ab9ea89 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -47,6 +47,11 @@ namespace llvm::AMDGPU { #include "AMDGPUGenSearchableTables.inc" } // namespace llvm::AMDGPU +static cl::opt<bool> EnableDiffBasePtrMemClustering( + "amdgpu-enable-diff-baseptr-mem-clustering", + cl::desc("Enable clustering memory ops with different base pointers"), + cl::init(true), cl::Hidden); + // Must be at least 4 to be able to branch over minimum unconditional branch // code. This is only for making it possible to write reasonably small tests for // long branches. @@ -522,6 +527,22 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth( return false; } +static bool memOpsHaveSameAddrspace(const MachineInstr &MI1, + ArrayRef<const MachineOperand *> BaseOps1, + const MachineInstr &MI2, + ArrayRef<const MachineOperand *> BaseOps2) { + // If base is identical, assume identical addrspace + if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front())) + return true; + + if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand()) + return false; + + auto *MO1 = *MI1.memoperands_begin(); + auto *MO2 = *MI2.memoperands_begin(); + return MO1->getAddrSpace() == MO2->getAddrSpace(); +} + static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef<const MachineOperand *> BaseOps1, const MachineInstr &MI2, @@ -559,14 +580,25 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const { - // If the mem ops (to be clustered) do not have the same base ptr, then they - // should not be clustered unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit; if (!BaseOps1.empty() && !BaseOps2.empty()) { const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent(); const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent(); - if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2)) - return false; + + if 
(EnableDiffBasePtrMemClustering) { + // Only consider memory ops from same addrspace for clustering + if (!memOpsHaveSameAddrspace(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2)) + return false; + + // Don't cluster scalar and vector memory ops + if (isVMEM(FirstLdSt) != isVMEM(SecondLdSt)) + return false; + } else { + // If the mem ops (to be clustered) do not have the same base ptr, then + // they should not be clustered + if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2)) + return false; + } const SIMachineFunctionInfo *MFI = FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll index 27b93872b9f1d..f562d958529d1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll @@ -8,31 +8,31 @@ define void @add_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v0 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v8, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v9, v[6:7] -; GFX8-NEXT: flat_load_ushort v10, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v10, v[8:9] +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 2, v2 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_ushort v11, v[0:1] +; GFX8-NEXT: flat_load_ushort v12, v[2:3] +; GFX8-NEXT: flat_load_ushort v8, v[8:9] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_ushort v11, v[2:3] -; GFX8-NEXT: flat_load_ushort v12, v[0:1] ; GFX8-NEXT: 
flat_load_ushort v6, v[6:7] +; GFX8-NEXT: flat_load_ushort v7, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u16_e32 v7, v8, v11 +; GFX8-NEXT: s_waitcnt vmcnt(3) +; GFX8-NEXT: v_add_u16_e32 v9, v11, v12 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u16_e32 v8, v9, v12 +; GFX8-NEXT: v_add_u16_e32 v6, v6, v8 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v6, v10, v6 -; GFX8-NEXT: flat_store_short v[4:5], v7 -; GFX8-NEXT: flat_store_short v[0:1], v8 -; GFX8-NEXT: flat_store_short v[2:3], v6 +; GFX8-NEXT: v_add_u16_e32 v7, v10, v7 +; GFX8-NEXT: flat_store_short v[4:5], v9 +; GFX8-NEXT: flat_store_short v[0:1], v6 +; GFX8-NEXT: flat_store_short v[2:3], v7 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -153,28 +153,28 @@ define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v0 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v12, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v13, v[6:7] -; GFX8-NEXT: flat_load_ushort v14, v[8:9] -; GFX8-NEXT: flat_load_ushort v15, v[10:11] -; GFX8-NEXT: flat_load_ushort v16, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2 +; GFX8-NEXT: flat_load_ushort v12, v[6:7] +; GFX8-NEXT: flat_load_ushort v13, v[8:9] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 8, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; 
GFX8-NEXT: flat_load_ushort v14, v[6:7] +; GFX8-NEXT: flat_load_ushort v15, v[8:9] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v2 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 6, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 4, v2 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 8, v2 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v2 ; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_ushort v16, v[0:1] ; GFX8-NEXT: flat_load_ushort v17, v[2:3] -; GFX8-NEXT: flat_load_ushort v18, v[0:1] -; GFX8-NEXT: flat_load_ushort v19, v[6:7] -; GFX8-NEXT: flat_load_ushort v20, v[8:9] +; GFX8-NEXT: flat_load_ushort v18, v[6:7] +; GFX8-NEXT: flat_load_ushort v19, v[8:9] ; GFX8-NEXT: flat_load_ushort v10, v[10:11] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_ushort v11, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4 @@ -184,20 +184,20 @@ define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 8, v4 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u16_e32 v11, v12, v17 +; GFX8-NEXT: v_add_u16_e32 v16, v16, v17 ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u16_e32 v12, v13, v18 +; GFX8-NEXT: v_add_u16_e32 v12, v12, v18 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u16_e32 v13, v14, v19 +; GFX8-NEXT: v_add_u16_e32 v13, v13, v19 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u16_e32 v14, v15, v20 +; GFX8-NEXT: v_add_u16_e32 v10, v14, v10 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v10, v16, v10 -; GFX8-NEXT: flat_store_short v[4:5], v11 +; GFX8-NEXT: v_add_u16_e32 v11, v15, v11 +; GFX8-NEXT: flat_store_short v[4:5], v16 ; GFX8-NEXT: flat_store_short v[0:1], v12 ; GFX8-NEXT: flat_store_short v[2:3], v13 
-; GFX8-NEXT: flat_store_short v[6:7], v14 -; GFX8-NEXT: flat_store_short v[8:9], v10 +; GFX8-NEXT: flat_store_short v[6:7], v10 +; GFX8-NEXT: flat_store_short v[8:9], v11 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -513,25 +513,25 @@ define void @add_v9i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v14, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: flat_load_ushort v1, v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u16_e32 v1, v6, v10 -; GFX8-NEXT: v_add_u16_sdwa v2, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v3, v7, v11 -; GFX8-NEXT: v_add_u16_sdwa v10, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v11, v8, v12 +; GFX8-NEXT: v_add_u16_e32 v2, v6, v10 +; GFX8-NEXT: v_add_u16_sdwa v3, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v10, v7, v11 +; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v14, v8, v12 ; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e32 v12, v9, v13 ; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v13, v14, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v10 -; GFX8-NEXT: v_or_b32_e32 v2, v11, 
v8 +; GFX8-NEXT: v_add_u16_e32 v13, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v10, v11 +; GFX8-NEXT: v_or_b32_e32 v2, v14, v8 ; GFX8-NEXT: v_or_b32_e32 v3, v12, v9 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -604,10 +604,10 @@ define void @add_v10i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v14, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_dword v15, v[0:1] +; GFX8-NEXT: flat_load_dword v15, v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_add_u16_e32 v0, v6, v10 ; GFX8-NEXT: v_add_u16_sdwa v1, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -663,53 +663,53 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v2 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 18, v2 -; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_ushort v14, v[14:15] -; GFX8-NEXT: flat_load_ushort v15, v[16:17] -; GFX8-NEXT: flat_load_ushort v16, v[2:3] -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u16_e32 v17, v6, v10 -; GFX8-NEXT: v_add_u16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u32_e32 
v6, vcc, 18, v0 -; GFX8-NEXT: v_add_u16_e32 v18, v7, v11 -; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v14, v6, v10 +; GFX8-NEXT: v_add_u16_sdwa v15, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v0 +; GFX8-NEXT: v_add_u16_e32 v16, v7, v11 +; GFX8-NEXT: v_add_u16_sdwa v17, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u16_e32 v18, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v12, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 18, v0 +; GFX8-NEXT: v_add_u16_e32 v19, v9, v13 +; GFX8-NEXT: v_add_u16_sdwa v13, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_ushort v20, v[6:7] +; GFX8-NEXT: flat_load_ushort v21, v[8:9] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 18, v2 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 20, v0 -; GFX8-NEXT: flat_load_ushort v2, v[2:3] -; GFX8-NEXT: flat_load_ushort v3, v[6:7] ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v21, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v2 +; GFX8-NEXT: flat_load_ushort v10, v[10:11] +; GFX8-NEXT: flat_load_ushort v11, v[6:7] +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_ushort v22, v[0:1] +; GFX8-NEXT: flat_load_ushort v2, v[2:3] ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GFX8-NEXT: v_add_u16_e32 v19, v8, v12 -; GFX8-NEXT: v_add_u16_sdwa v12, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; 
GFX8-NEXT: v_add_u32_e32 v8, vcc, 18, v4 -; GFX8-NEXT: v_add_u16_e32 v20, v9, v13 -; GFX8-NEXT: v_add_u16_sdwa v13, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v17, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v18, v11 +; GFX8-NEXT: v_or_b32_e32 v0, v14, v15 +; GFX8-NEXT: v_or_b32_e32 v1, v16, v17 +; GFX8-NEXT: v_or_b32_e32 v3, v19, v13 +; GFX8-NEXT: s_waitcnt vmcnt(3) +; GFX8-NEXT: v_add_u16_e32 v20, v20, v10 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, 20, v4 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u16_e32 v14, v2, v14 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u16_e32 v15, v3, v15 -; GFX8-NEXT: v_or_b32_e32 v2, v19, v12 -; GFX8-NEXT: v_or_b32_e32 v3, v20, v13 +; GFX8-NEXT: v_add_u16_e32 v21, v21, v11 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v16, v21, v16 +; GFX8-NEXT: v_add_u16_e32 v14, v22, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v18, v12 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: flat_store_short v[6:7], v14 -; GFX8-NEXT: flat_store_short v[8:9], v15 -; GFX8-NEXT: flat_store_short v[10:11], v16 +; GFX8-NEXT: flat_store_short v[6:7], v20 +; GFX8-NEXT: flat_store_short v[8:9], v21 +; GFX8-NEXT: flat_store_short v[10:11], v14 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -794,34 +794,34 @@ define void @add_v12i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: 
v_add_u16_e32 v2, v6, v10 -; GFX8-NEXT: v_add_u16_sdwa v3, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v10, v7, v11 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v14, v6, v10 +; GFX8-NEXT: v_add_u16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v15, v7, v11 ; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: v_add_u16_e32 v16, v8, v12 -; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v12, v9, v13 -; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v10, v11 -; GFX8-NEXT: v_or_b32_e32 v2, v16, v8 -; GFX8-NEXT: v_or_b32_e32 v3, v12, v9 +; GFX8-NEXT: v_add_u16_sdwa v12, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v17, v9, v13 +; GFX8-NEXT: v_add_u16_sdwa v13, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[2:3] +; GFX8-NEXT: v_or_b32_e32 v0, v14, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v15, v11 +; GFX8-NEXT: v_or_b32_e32 v2, v16, v12 +; GFX8-NEXT: v_or_b32_e32 v3, v17, v13 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u16_e32 v8, v6, v14 -; GFX8-NEXT: v_add_u16_sdwa v6, v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v9, v7, v15 -; GFX8-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v10, v6, v8 +; GFX8-NEXT: v_add_u16_sdwa v6, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v8, v7, v9 +; GFX8-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v4 -; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v9, v7 +; GFX8-NEXT: v_or_b32_e32 v6, v10, v6 +; GFX8-NEXT: v_or_b32_e32 v7, v8, v7 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[6:7] ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index 5733cf9a44d32..d98dc6d7f6938 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -645,6 +645,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1030-NEXT: s_clause 0x1 ; GFX1030-NEXT: flat_load_dword v0, v[0:1] ; GFX1030-NEXT: flat_load_dword v1, v[2:3] ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 @@ -674,6 +675,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 +; GFX1013-NEXT: s_clause 0x1 ; GFX1013-NEXT: flat_load_dword v0, v[4:5] ; GFX1013-NEXT: flat_load_dword v1, v[2:3] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 @@ -711,6 +713,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: 
v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 @@ -757,6 +760,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1030-NEXT: s_clause 0x1 ; GFX1030-NEXT: flat_load_dword v0, v[0:1] ; GFX1030-NEXT: flat_load_dword v1, v[2:3] ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 @@ -783,6 +787,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500 +; GFX1013-NEXT: s_clause 0x1 ; GFX1013-NEXT: flat_load_dword v0, v[4:5] ; GFX1013-NEXT: flat_load_dword v1, v[2:3] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 @@ -816,6 +821,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll index 603eb88c07afb..e6f5b7a295dfa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll @@ -77,10 +77,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[32:39] ; W32-NEXT: v_wmma_f16_16x16x16_f16 
v[32:39], v[16:23], v[24:31], v[32:39] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[40:41], v[44:47], off ; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16 -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[42:43], v[32:35], off ; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 ; W32-NEXT: s_endpgm @@ -102,10 +101,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h ; W32-NEXT: v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39] ; W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; W32-NEXT: v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[44:51] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[40:41], v[44:47], off ; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16 -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[42:43], v[32:35], off ; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 ; W32-NEXT: s_endpgm @@ -152,10 +150,9 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[32:39] ; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[40:41], v[44:47], off ; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16 -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[42:43], v[32:35], off ; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 ; W32-NEXT: s_endpgm @@ -177,10 +174,9 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x ; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39] ; W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[44:51] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[40:41], 
v[44:47], off ; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16 -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[42:43], v[32:35], off ; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 ; W32-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll index 7deaca4ca78b4..e79c398e74f68 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll @@ -69,6 +69,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[32:35] ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[36:37], v[40:43], off ; W64-NEXT: global_store_b128 v[38:39], v[32:35], off ; W64-NEXT: s_endpgm @@ -90,6 +91,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35] ; W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[40:43] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[36:37], v[40:43], off ; W64-NEXT: global_store_b128 v[38:39], v[32:35], off ; W64-NEXT: s_endpgm @@ -132,6 +134,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[32:35] ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[36:37], v[40:43], off ; W64-NEXT: global_store_b128 v[38:39], v[32:35], off ; W64-NEXT: s_endpgm @@ -153,6 +156,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:35], 
v[16:23], v[24:31], v[32:35] ; W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[40:43] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[36:37], v[40:43], off ; W64-NEXT: global_store_b128 v[38:39], v[32:35], off ; W64-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll index c295a662704e9..7ad04f3a5de64 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -126,17 +126,17 @@ define amdgpu_kernel void @localize_globals(i1 %cond) { ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, gv0@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, gv0@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, gv1@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, gv1@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9-NEXT: global_store_dword v0, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB1_4: ; %bb2 ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index c87c334217b77..5644d3a42b0c9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -61,6 +61,7 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 
2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-NEXT: global_load_dword v4, v3, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -82,6 +83,7 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[0:1], v1, s[2:3] ; GFX11-NEXT: global_load_b32 v5, v2, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -113,6 +115,7 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v2, s[2:3] ; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -134,6 +137,7 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll index 7eafe53ea84cf..5dd768796dd7c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll @@ -13,10 +13,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20 ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 -; GFX12-NEXT: 
s_clause 0x1 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off ; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 -; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off ; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 ; GFX12-NEXT: s_endpgm @@ -43,10 +42,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 -; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off ; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 -; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off ; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 ; GFX12-NEXT: s_endpgm @@ -71,6 +69,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16 ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off ; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off ; GFX12-NEXT: s_endpgm @@ -95,6 +94,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16 ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off ; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off ; GFX12-NEXT: s_endpgm @@ -121,10 +121,9 @@ define amdgpu_ps void 
@test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 -; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off ; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 -; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off ; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 ; GFX12-NEXT: s_endpgm @@ -151,10 +150,9 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 -; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off ; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16 -; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off ; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16 ; GFX12-NEXT: s_endpgm @@ -181,10 +179,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 -; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off ; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 -; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off ; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 ; GFX12-NEXT: s_endpgm @@ -211,10 +208,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, 
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 -; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off ; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 -; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off ; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 ; GFX12-NEXT: s_endpgm @@ -241,10 +237,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 -; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off ; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 -; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off ; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 ; GFX12-NEXT: s_endpgm @@ -271,10 +266,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 -; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off ; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 -; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off ; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll index 1e9ef07ba7542..af61f614519c0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll @@ -23,6 +23,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off ; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off ; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off @@ -67,6 +68,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off ; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off ; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off @@ -105,6 +107,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2 ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off ; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off ; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off @@ -143,6 +146,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 
index_key:2 ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off ; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off ; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off @@ -187,6 +191,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off ; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off ; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off @@ -221,6 +226,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off ; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off ; GFX12-NEXT: s_endpgm @@ -247,6 +253,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7 ; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off ; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off ; GFX12-NEXT: s_endpgm @@ -283,6 +290,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: s_clause 0x3 ; 
GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off ; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off ; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off @@ -327,6 +335,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off ; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off ; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off @@ -371,6 +380,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off ; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off ; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off @@ -415,6 +425,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off ; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off ; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index 50d20e9b0e4d7..443e6e6402be2 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -134,6 +134,7 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; 
GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX10-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -148,6 +149,7 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index fa73ef0b0ec4c..983c415442a66 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -6289,9 +6289,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 @@ -6355,7 +6355,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -6436,9 +6436,8 @@ define 
<128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v32 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v31 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 @@ -6805,11 +6804,11 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24 ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v69.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h @@ -6876,8 +6875,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 ; GFX11-FAKE16-NEXT: s_clause 0x2 ; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -7086,9 +7085,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 ; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v17, 3, v17 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v32 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v32 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v31 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 @@ -7478,11 +7476,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v82 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v81 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v80 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v70 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v69 @@ -16419,94 +16418,94 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:376 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:368 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v49, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v68, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:360 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v33, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:128 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v163, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_b32 v99, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:40 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v87, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:224 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:200 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v50, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v166, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 @@ -16552,87 +16551,88 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 
8, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v49.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v53.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v70.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v70.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v52.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v99 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v83.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v84.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v84.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v85.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v117.l, 8, v86.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v87.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v160.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v65.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v54.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l -; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v161.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v53.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v52.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v49.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v48.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v39.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v31.h +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v31.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v160.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v166.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v166.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v167.l ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -16736,143 +16736,143 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l ; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v82.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 ; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v81.l ; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v80.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v85.h ; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v81.h ; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; 
GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v55.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v96.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v83.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v83.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v84.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v85.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v69.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v70.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v69.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v71.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v50.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v51.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v65.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v66.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v36.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v54.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v53.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v49.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v34, v37 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v38, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v51, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 @@ -16901,39 +16901,39 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 @@ -16971,36 +16971,36 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true @@ -17144,15 +17144,15 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v82.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v82.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 @@ -17166,67 +17166,67 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v81.l, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v80.l, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v85.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v81.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.h, v17.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v64.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v87.l, v17.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v64.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v55.l, 3 ; 
GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.h, v19.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v54.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v96.l, v18.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v83.l, v17.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v83.h, v17.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h @@ -17234,13 +17234,13 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v55.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v84.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v84.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v38.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 @@ -17249,15 +17249,15 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v69.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v23.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v69.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v23.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 @@ -17265,68 +17265,68 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v51.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v50.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v71.l, v24.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v51.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v50.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v65.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v66.l, v23.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v65.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v66.h, v23.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l ; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v67.l, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v37.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v52.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v54.l, v29.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v24 
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v52.h, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v51 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 @@ -17336,14 +17336,14 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v39.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v48.l, v27.h 
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v48.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v49.l, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v49.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l @@ -17353,12 +17353,12 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v34, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v38, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v51, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17366,58 +17366,64 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v46, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v108, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:616 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:612 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:608 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:604 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:596 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:592 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 
offset:556 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:516 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:492 +; GFX11-FAKE16-NEXT: s_clause 0x18 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 
offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:392 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 @@ -17427,94 +17433,94 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 ; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:376 ; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:368 ; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 
v102, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 
offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:128 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v112, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:72 ; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:224 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v139, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v140, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v141, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v142, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v143, off, s32 offset:336 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v82, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v152, off, s32 offset:328 ; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76 @@ -17543,85 +17549,89 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v87 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v97 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v112 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v113 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v114 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v115 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v105, 8, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v116 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v130 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v131 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v132 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v133 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v134 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v146 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v20 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v18 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v16 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v149 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v149, 8, v14 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 -; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v133, 8, v12 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v178, 8, v178 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v134, 8, v10 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v177, 8, v177 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v8 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v176 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, 
v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v167 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v137 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v136 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v138 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v139 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v140 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v141 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v142 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v143 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v152 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -17716,12 +17726,12 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v44 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v19, 0xff, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63 @@ -17747,26 +17757,26 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v162 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, 
v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v117 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v183 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v166 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v177 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v178 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 @@ -17782,26 +17792,26 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119 -; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v28, v28, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v147 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v148 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v150 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v134 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 @@ -17817,26 +17827,26 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v69 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v70 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v114 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v97 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v99 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 @@ -17880,39 +17890,39 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 -; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 @@ -17950,36 +17960,36 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 ; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 ; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 @@ -18121,12 +18131,12 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v180, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v165, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v164, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v163, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v161, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v179, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, 
v14 @@ -18172,16 +18182,16 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v162, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v128, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v119, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v117, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v145, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v144, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v129, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 @@ -18192,16 +18202,16 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26 
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v181, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v182, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v183, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v40, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v41, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v166, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v167, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v176, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v177, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v178, v26 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 @@ -18227,16 +18237,16 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v118, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v86, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v84, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v83, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v103, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v102, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v101, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v100, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v71, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 @@ -18247,16 
+18257,16 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v146, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v147, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v148, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v149, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v150, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v130, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v131, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v132, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v133, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v134, v31 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 @@ -18282,13 +18292,13 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v69, 3 +; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v82, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v81, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v80, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v70, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3 @@ -18302,16 +18312,16 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v112, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v113, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v114, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v115, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v116, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v87, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v96, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v97, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v98, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v99, v36 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 @@ -18340,58 +18350,64 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: .LBB14_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 -; GFX11-FAKE16-NEXT: 
scratch_load_b32 v76, off, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592 +; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 
offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:516 +; GFX11-FAKE16-NEXT: s_clause 0x18 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:548 +; 
GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:592 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:596 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:604 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:608 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:612 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:616 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -21660,14 +21676,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 ; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:288 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 
offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276 @@ -22070,26 +22086,26 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v35 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, 
v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 @@ -22509,30 +22525,30 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v33, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v32, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v36 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -22601,47 +22617,43 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 
offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 
offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:336 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:324 
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:320 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 @@ -22651,88 +22663,88 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v100, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:208 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 
offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, 
off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:260 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:272 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v80, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:28 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5 @@ -22750,71 +22762,64 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28 -; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v46, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v47, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v181, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v182, 8, v30 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v165, 8, v31 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v96 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v97 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111 -; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v132 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v100 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v183, 8, 
v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 8, v129 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v149 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v95 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v104 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v105 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v100, 8, v106 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v107 ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 @@ -22881,153 +22886,153 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v42 
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v58 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v59 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v179 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v62 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v46 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v47 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v182 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v183 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v119 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v165 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v102 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v147 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v148 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v150 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v151 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v85 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 @@ -23036,8 +23041,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v83 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 @@ -23045,30 +23050,30 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v96 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v97 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v98 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v69 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v100 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -23240,10 +23245,10 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v57 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v43 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v160 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2 @@ -23261,9 +23266,9 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, 
v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v62, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v47, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v56, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -23279,14 +23284,13 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v42 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v180 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v179 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v182 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v164 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v163 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v162 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -23297,16 +23301,16 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 -; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v0, v57, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v128 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v103 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v60, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v61, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v44, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v45, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v46, v15 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 @@ -23317,9 +23321,9 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v41, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v176, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v177, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -23334,14 +23338,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166 -; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v1, 3, v165 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v144 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v135 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v119 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v161 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v112 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -23352,16 +23356,16 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v151, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v134 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v118 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v181, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v183, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v40, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v165, v17 
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v166, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v167, v20 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 @@ -23372,9 +23376,9 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v151, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v132, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v133, v27 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -23389,14 +23393,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v87 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v86 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v84 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v71 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v70 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v68 ; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v26, 0xffff, v26 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -23407,14 +23411,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v148, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v149, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v150, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v129, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v130, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v131, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -23436,16 +23440,16 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v66 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v85 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v101 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v83 ; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v29, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v80 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v81 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v65 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v64 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -23459,11 +23463,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v96, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v97, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v98, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v99, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v100, v32 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 @@ -23492,47 +23496,43 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-FAKE16-NEXT: .LBB15_3: ; %end ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 -; 
GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 
v46, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, 
s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:444 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:460 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB15_4: @@ -44019,9 +44019,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 @@ -44085,7 +44085,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -44164,8 +44164,8 @@ define <128 x i8> 
@bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 1.0, v20 :: v_dual_add_f32 v19, 1.0, v19 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 1.0, v18 :: v_dual_add_f32 v17, 1.0, v17 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_add_f32 v24, 1.0, v24 :: v_dual_add_f32 v31, 1.0, v31 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_add_f32 v32, 1.0, v32 :: v_dual_add_f32 v23, 1.0, v23 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v15, 1.0, v15 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v26, 1.0, v26 :: v_dual_add_f32 v29, 1.0, v29 @@ -44518,11 +44518,11 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24 ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v69.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h @@ -44589,8 +44589,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 ; GFX11-FAKE16-NEXT: s_clause 0x2 ; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -44797,8 +44797,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> 
%a, i32 %b) { ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v20, 1.0, v20 :: v_dual_add_f32 v19, 1.0, v19 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v18, 1.0, v18 :: v_dual_add_f32 v17, 1.0, v17 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_dual_add_f32 v24, 1.0, v24 :: v_dual_add_f32 v31, 1.0, v31 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 1.0, v32 :: v_dual_add_f32 v23, 1.0, v23 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v15, 1.0, v15 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v26, 1.0, v26 :: v_dual_add_f32 v29, 1.0, v29 @@ -45174,11 +45175,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v82 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v81 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v80 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v70 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v69 @@ -55136,94 +55138,94 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:376 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 
offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:368 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 -; GFX11-TRUE16-NEXT: s_clause 0x1f -; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 
offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:128 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_b32 v99, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:8 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v84, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:224 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:216 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v68, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v166, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 @@ -55269,87 +55271,88 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v54.l, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v49.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v53.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v70.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v70.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v52.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v99 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v83.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v84.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v84.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, 
v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v85.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v87.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v160.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v65.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v101.l, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v54.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v161.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v53.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v52.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v49.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v48.h +; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(24) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v39.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v31.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v31.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v160.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v166.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v166.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v167.l ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -55453,143 +55456,143 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l ; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v82.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 ; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h ; 
GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v81.l ; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v80.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v85.h ; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v81.h ; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v55.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v96.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v83.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v83.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v84.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v85.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.h ; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v69.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v70.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v69.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v71.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v50.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v65.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v66.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v36.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v54.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v53.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v49.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, 
v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v34, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v38, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v51, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 @@ -55618,39 +55621,39 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 @@ -55688,36 +55691,36 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-TRUE16-NEXT: .LBB38_4: ; %cmp.true @@ -55861,15 +55864,15 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v82.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v82.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 @@ -55883,67 +55886,67 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v81.l, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l ; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v85.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v81.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.h, v17.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v64.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v87.l, v17.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v64.h, 3 ; 
GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v55.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.h, v19.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v54.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v96.l, v18.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v83.l, v17.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v83.h, v17.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h @@ -55951,13 +55954,13 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v55.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v84.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v84.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v38.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 @@ -55966,15 +55969,15 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v69.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v23.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v24.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v69.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v23.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 @@ -55982,68 +55985,68 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v51.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v50.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v71.l, v24.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v51.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v50.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v65.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v66.l, v23.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v23.h, v65.l, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v65.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v66.h, v23.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v67.l, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v37.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v52.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v54.l, v29.l ; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v52.h, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v51 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 @@ -56053,14 +56056,14 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v39.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v48.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v48.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v49.l, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v49.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l @@ -56070,12 +56073,12 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v34, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v38, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v51, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -56083,58 +56086,64 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v42, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v104, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:616 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:612 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:608 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:604 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:596 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:592 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, 
s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:516 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:492 +; GFX11-FAKE16-NEXT: s_clause 0x18 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 
offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:392 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 @@ -56144,94 +56153,94 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 ; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:376 ; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:368 ; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, 
off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 
offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:128 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 -; GFX11-FAKE16-NEXT: 
scratch_load_u16 v132, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v112, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:72 ; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 
v161, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:224 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v139, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v140, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v141, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 
offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v142, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v143, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v152, off, s32 offset:328 ; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76 @@ -56260,85 +56269,89 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v87 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v97 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v112 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v113 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v95, 8, v114 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v115 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v116 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v130 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v131 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v132 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v133 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v134 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v146 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v20 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v18 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v16 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v47, 8, v164 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v149 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v149, 8, v14 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v133, 8, v12 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v178, 8, v178 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v134, 8, v10 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v177, 8, v177 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v8 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v176 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 -; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v131, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v167 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v137 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v136 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v138 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v139 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v140 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v141 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v142 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v143 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v152 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -56433,12 +56446,12 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v44 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63 @@ -56464,26 +56477,26 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v162 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163 -; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v24, v24, v164 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v117 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v183 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v166 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v177 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v178 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 @@ -56499,26 +56512,26 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145 
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v147 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v148 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v150 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v134 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 @@ -56534,26 +56547,26 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v30, 0xff, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v70 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v114 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v97 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v99 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 @@ -56597,39 +56610,39 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 ; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 @@ -56667,36 +56680,36 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 ; GFX11-FAKE16-NEXT: .LBB38_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB38_4 @@ -56838,12 +56851,12 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v180, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v165, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v164, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 
v19, v163, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v161, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v179, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 @@ -56889,16 +56902,16 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v162, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v128, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v119, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v117, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v145, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v144, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v129, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 @@ -56909,16 +56922,16 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 
v183, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v181, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v182, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v183, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v40, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v41, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v166, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v167, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v176, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v177, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v178, v26 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 @@ -56944,16 +56957,16 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v118, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v86, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v84, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v83, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v103, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v102, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v101, 3 
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v100, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v71, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 @@ -56964,16 +56977,16 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v146, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v147, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v148, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v149, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v150, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v130, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v131, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v132, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v133, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v134, v31 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 @@ -56999,13 +57012,13 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3 -; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v29, v71, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v69, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v82, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v81, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v80, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v70, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3 @@ -57019,16 +57032,16 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v112, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v113, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v114, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v115, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v116, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v87, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v96, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v97, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v98, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v99, v36 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 ; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v28, 0x300, v28 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 @@ -57057,58 +57070,64 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: .LBB38_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492 -; 
GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592 +; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 
v139, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:516 +; GFX11-FAKE16-NEXT: s_clause 0x18 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 
offset:532 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:592 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:596 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:604 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:608 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:612 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:616 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -60377,14 +60396,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 ; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 
offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:288 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276 @@ -60787,26 +60806,26 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v35 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 @@ -61226,30 +61245,30 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28 ; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v28, 8, v39 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v33, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v32, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v36 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -61318,47 +61337,43 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v58, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320 +; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v40, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:340 +; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:336 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:320 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 @@ -61368,88 +61383,88 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:300 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232 -; GFX11-FAKE16-NEXT: 
scratch_load_u16 v94, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:208 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 
offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, 
s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:260 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20 -; GFX11-FAKE16-NEXT: 
scratch_load_u16 v56, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:28 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5 @@ -61467,71 +61482,64 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16 -; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v46, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v47, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v181, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v182, 8, v30 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v165, 8, v31 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v96 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v144, 8, v116 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v97 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111 -; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v167, 8, v132 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v100 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v183, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 8, v129 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v149 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v95 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v104 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v105 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v100, 8, v106 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v107 ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB39_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 @@ -61598,153 +61606,153 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v58 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v59 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v179 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 
v1, 0xff, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v62 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v46 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v47 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v16, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v182 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v183 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v119 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v0, v0, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v165 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v102 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v147 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v148 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v150 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v151 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 ; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v85 ; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 @@ -61753,8 +61761,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v83 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 @@ -61762,30 +61770,30 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v96 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v97 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v98 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v69 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v100 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -61957,10 +61965,10 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v57 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v43 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v160 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 ; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v1, v77, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2 @@ -61978,9 +61986,9 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v62, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v47, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v56, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -61996,14 +62004,13 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v42 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v180 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v179 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v182 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v164 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v163 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v162 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -62014,16 +62021,16 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v128 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v103 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v60, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v61, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v44, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v45, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v46, v15 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 @@ -62034,9 +62041,9 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v41, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v176, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v177, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ 
-62051,14 +62058,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v144 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v135 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v119 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v161 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v112 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -62069,16 +62076,16 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v151, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v134 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v26, 3, v118 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v181, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v183, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v40, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v165, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v166, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v167, v20 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 @@ -62089,9 +62096,9 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v151, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v132, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v133, v27 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -62106,14 +62113,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v87 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v86 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v84 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22 -; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v71 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v70 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v68 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -62124,14 +62131,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v148, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v149, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v150, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v129, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v130, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v131, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -62153,16 +62160,16 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 -; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v66 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v85 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v101 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v83 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v80 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v81 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v65 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v64 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -62176,11 +62183,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v96, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v97, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v98, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v99, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v100, v32 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 @@ -62209,47 +62216,43 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg 
%a, ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-FAKE16-NEXT: .LBB39_3: ; %end ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432 -; 
GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, 
off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:444 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:460 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB39_4: @@ -81719,9 +81722,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 @@ -81785,7 +81788,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -81897,14 +81900,13 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v28, vcc_lo ; GFX11-TRUE16-NEXT: v_add_co_u32 v29, vcc_lo, v29, 3 ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v30, null, 0, v30, vcc_lo -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v31, vcc_lo, v31, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v32, null, 0, v32, vcc_lo ; GFX11-TRUE16-NEXT: v_add_co_u32 v23, vcc_lo, v23, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v24, vcc_lo ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] @@ -82243,11 +82245,11 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24 ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v69.l ; GFX11-TRUE16-NEXT: 
v_and_b16 v24.l, 0xff, v32.h @@ -82314,8 +82316,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 ; GFX11-FAKE16-NEXT: s_clause 0x2 ; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -82555,14 +82557,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v28, vcc_lo ; GFX11-FAKE16-NEXT: v_add_co_u32 v29, vcc_lo, v29, 3 ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v30, null, 0, v30, vcc_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_add_co_u32 v31, vcc_lo, v31, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v32, null, 0, v32, vcc_lo ; GFX11-FAKE16-NEXT: v_add_co_u32 v23, vcc_lo, v23, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v24, vcc_lo ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] @@ -82924,11 +82926,12 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v82 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v81 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v80 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v70 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v69 @@ -91872,94 +91875,94 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:376 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:368 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 
v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 
offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:128 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180 
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_b32 v99, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:224 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 
offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v166, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 @@ -92005,87 +92008,88 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v49.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v53.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v70.l +; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(57) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v70.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v52.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v99 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v83.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v84.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v84.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v85.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v87.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v69.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v160.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v65.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v54.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v161.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v53.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 
8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v52.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v49.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v48.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v39.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v31.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v31.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v160.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v166.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(14) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v166.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v167.l ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -92189,143 +92193,143 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l ; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v82.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 ; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v81.l ; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v80.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v85.h ; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: 
v_and_b16 v17.l, 0xff, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v81.h ; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v55.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v96.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, 
v17.l, v81.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v83.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v83.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v84.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v85.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v69.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v70.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v69.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v71.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v50.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v65.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v66.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v36.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v54.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v53.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v49.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v34, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v38, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v51, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 @@ -92354,39 +92358,39 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 @@ -92424,36 +92428,36 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true @@ -92597,15 +92601,15 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, 
v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v82.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v82.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 @@ -92619,67 +92623,67 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v81.l, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v85.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v81.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 ; 
GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.h, v17.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v64.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v87.l, v17.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v64.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v55.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.h, v19.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v54.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v96.l, v18.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l ; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v83.l, v17.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v83.h, v17.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h @@ -92687,13 +92691,13 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v55.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v84.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v84.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v39.l, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v38.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 @@ -92702,15 +92706,15 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v69.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v23.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v69.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v23.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 @@ -92718,68 +92722,68 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v51.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v50.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v71.l, v24.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v51.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v50.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v65.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v66.l, v23.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v65.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v66.h, v23.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l 
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v67.l, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v37.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v52.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v54.l, v29.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v52.h, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v51 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 
0xffff, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 @@ -92789,14 +92793,14 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v39.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v48.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v48.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v49.l, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v49.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l @@ -92806,12 +92810,12 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 
0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v34, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v38, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v51, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -92819,58 +92823,64 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v73, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408 -; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:616 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:612 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:608 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:604 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:596 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:592 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:516 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, 
v90, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:492 +; GFX11-FAKE16-NEXT: s_clause 0x18 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:396 +; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v152, s32 offset:392 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 @@ -92880,94 +92890,94 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 ; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:376 ; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:368 ; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:188 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:128 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, 
off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:88 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v112, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:72 ; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:224 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 
v129, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v139, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v140, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v141, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v142, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v143, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v152, off, s32 offset:328 ; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76 @@ -92996,85 +93006,89 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v26 
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v87 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v97 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v112 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v113 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v114 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v115 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v116 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v130 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v131 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v132 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v133 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, 
v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v134 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v146 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v20 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v18 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v16 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v149 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v149, 8, v14 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v133, 8, v12 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v178, 8, v178 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v134, 8, v10 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v177, 8, v177 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v112, 8, v8 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v176 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v167 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v137 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v136 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v138 ; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v139 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v140 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v141 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v142 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v143 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v152 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -93169,12 +93183,12 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v44 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63 @@ -93200,26 +93214,26 @@ define <16 x i64> 
@bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v162 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v117 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v183 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v166 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v167 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v24, v24, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v177 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v178 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 @@ -93235,26 +93249,26 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v100 
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v147 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v148 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v150 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v134 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 @@ -93270,26 +93284,26 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v70 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, 
v30, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v114 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v97 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v99 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 @@ -93333,39 +93347,39 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 ; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 @@ -93403,36 +93417,36 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 ; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 ; GFX11-FAKE16-NEXT: .LBB58_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_4 @@ -93574,12 +93588,12 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v180, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v165, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v164, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v163, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v161, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v179, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 @@ -93625,16 +93639,16 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v162, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3 -; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v128, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v119, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v117, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v145, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v144, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v129, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 @@ -93645,16 +93659,16 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v181, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v182, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v183, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v40, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v41, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v166, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v167, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v176, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v177, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v178, v26 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 ; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v19, 0x300, v19 @@ -93680,16 +93694,16 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v118, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v86, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v84, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v83, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v103, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v102, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v101, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v100, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v71, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 @@ -93700,16 +93714,16 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 
v129, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v146, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v147, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v148, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v149, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v150, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v130, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v131, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v132, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v133, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v134, v31 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 @@ -93735,13 +93749,13 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v69, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v82, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v81, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v80, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v70, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3 @@ -93755,16 +93769,16 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v112, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v113, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v114, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v115, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v116, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v87, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v96, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v97, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v98, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v99, v36 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 @@ -93793,58 +93807,64 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: .LBB58_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424 -; 
GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544 -; GFX11-FAKE16-NEXT: 
scratch_load_b32 v59, off, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592 +; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:464 +; 
GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:516 +; GFX11-FAKE16-NEXT: s_clause 0x18 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_load_b32 
v47, off, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:592 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:596 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:604 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:608 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:612 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:616 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -97113,14 +97133,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 ; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:288 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280 
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276 @@ -97523,26 +97543,26 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v35 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 @@ -97962,30 +97982,30 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v33, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v32, v34 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v32, v37, v36 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -98054,47 +98074,43 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v88, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:408 +; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v62, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:336 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:320 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 @@ -98104,88 +98120,88 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 
offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 
offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:208 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: 
scratch_load_u16 v95, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140 -; GFX11-FAKE16-NEXT: 
scratch_load_u16 v146, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:244 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v95, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:260 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 
offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:28 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5 @@ -98203,71 +98219,64 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v46, 8, v22 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v47, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v181, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v182, 8, v30 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v165, 8, v31 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v96 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v97 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104 -; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111 -; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v132 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v100 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v183, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 8, v129 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v149 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v95 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v104 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v105 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v100, 8, v106 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v107 ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 @@ -98334,153 +98343,153 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v58 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v59 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 
0xff, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v179 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v62 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v1, v1, v46 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v47 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v182 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v183 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v119 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v165 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v102 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v147 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v148 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v150 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v151 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v85 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 @@ -98489,8 +98498,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v83 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 @@ -98498,30 +98507,30 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg 
%a, i3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v96 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v97 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v98 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v69 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v0, v0, v98 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v100 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -98693,10 +98702,10 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v57 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v43 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v160 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2 @@ -98714,9 +98723,9 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v62, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v47, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v56, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -98732,14 +98741,13 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, 
v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v42 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v180 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v179 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v182 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v164 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v163 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v162 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -98750,16 +98758,16 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v128 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v103 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v60, 
v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v61, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v44, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v45, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v46, v15 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 @@ -98770,9 +98778,9 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v41, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v176, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v177, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -98787,14 +98795,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v144 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v135 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v119 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v161 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v112 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -98805,16 +98813,16 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v151, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v134 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v118 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v181, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v183, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v40, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v165, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v166, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v167, v20 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 @@ -98825,9 +98833,9 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v26, v130, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v151, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v132, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v133, v27 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -98842,14 +98850,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v87 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v86 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v84 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v71 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v70 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v68 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -98860,14 +98868,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v148, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v149, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v150, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v129, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v130, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v131, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -98889,16 +98897,16 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v66 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v85 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v101 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v83 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v80 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v81 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v65 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v64 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, 
v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -98912,11 +98920,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v96, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v97, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v98, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v99, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v100, v32 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 @@ -98945,47 +98953,43 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-FAKE16-NEXT: .LBB59_3: ; %end ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364 -; 
GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, 
off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:444 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:448 +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v42, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:460 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB59_4: @@ -117466,9 +117470,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 @@ -117532,7 +117536,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -117965,11 +117969,11 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24 ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, 
v69.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h @@ -118036,8 +118040,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 ; GFX11-FAKE16-NEXT: s_clause 0x2 ; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -118621,11 +118625,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v82 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v81 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v80 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v70 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v69 @@ -128551,94 +128556,94 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:376 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v31, off, s32 offset:368 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v65, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v161, off, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:132 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:128 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_b32 v99, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:8 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v84, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:72 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:224 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:216 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v68, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v166, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 @@ -128684,87 +128689,88 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v49.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v53.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v70.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v70.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v52.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v99 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v83.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v84.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v84.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v116.l, 8, v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v85.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v87.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v160.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v65.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v54.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v161.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v53.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v52.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v49.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v48.h 
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v39.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v31.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v31.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v160.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v166.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v166.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v167.l ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -128868,143 +128874,143 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l ; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v82.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 ; GFX11-TRUE16-NEXT: 
v_or_b16 v16.l, v13.h, v112.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v81.l ; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v80.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v85.h ; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v81.h ; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 
v19.l, v19.l, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v55.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v96.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v83.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v83.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v84.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v85.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, 
v38.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v69.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v70.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v69.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v71.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v50.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v65.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v66.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v36.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v54.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v53.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v49.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, 
v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v34, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v38, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v51, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 @@ -129033,39 +129039,39 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 @@ -129103,36 +129109,36 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_2 ; GFX11-TRUE16-NEXT: .LBB74_4: ; %cmp.true @@ -129276,15 +129282,15 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v82.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v82.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 @@ -129298,67 +129304,67 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v81.l, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 
v12.l, 0x300, v12.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v85.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v81.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.h, v17.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v64.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v87.l, v17.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v18.h, v64.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v55.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.h, v19.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v54.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v96.l, v18.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v83.l, v17.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v83.h, v17.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l ; GFX11-TRUE16-NEXT: 
v_and_b16 v18.l, 0xff, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h @@ -129366,13 +129372,13 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v55.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v84.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v84.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v38.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 @@ -129381,15 +129387,15 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v69.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v23.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 
v23.h, v69.h, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v69.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v23.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 @@ -129397,68 +129403,68 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v51.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v50.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v71.l, v24.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v51.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v50.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v65.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v66.l, v23.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 
v22.h, v64.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v65.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v66.h, v23.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v67.l, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v37.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v52.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 
v29.l, v54.l, v29.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v52.h, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v51 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 @@ -129468,14 +129474,14 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v29.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v39.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v48.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v48.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v49.l, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v49.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l @@ -129485,12 +129491,12 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v34, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v38, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v51, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -129498,58 +129504,64 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 
offset:588 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:616 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:612 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:608 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:604 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:596 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:592 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:576 +; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v59, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:516 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:492 +; GFX11-FAKE16-NEXT: s_clause 0x18 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:456 +; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v121, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:392 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 @@ -129559,94 +129571,94 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 ; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:376 ; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:368 ; GFX11-FAKE16-NEXT: 
scratch_load_u16 v66, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 
v8, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:128 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 
v138, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v112, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:72 ; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 
offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:224 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v139, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v140, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v141, off, s32 offset:352 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v142, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v143, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v152, off, s32 offset:328 ; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76 @@ -129675,85 +129687,89 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v87 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v97 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v112 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v113 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v95, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v114 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v115 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v116 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v130 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v131 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v132 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v133 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v134 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v146 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v20 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v18 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v16 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 -; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v149 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v149, 8, v14 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v133, 8, v12 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v178, 8, v178 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v134, 8, v10 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v177, 8, v177 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v8 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v176 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 -; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v167 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v137 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v136 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v138 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v139 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v140 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v141 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v142 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v143 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v152 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -129848,12 +129864,12 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 
v13, 0xff, v44 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63 @@ -129879,26 +129895,26 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v162 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v117 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v183 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v166 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v177 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v178 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 @@ -129914,26 +129930,26 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82 -; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v22, v22, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v147 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v148 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v150 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v134 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 @@ -129949,26 +129965,26 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v70 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v114 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v97 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v99 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 @@ -130012,39 +130028,39 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, 
i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 @@ -130082,36 +130098,36 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 ; GFX11-FAKE16-NEXT: .LBB74_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB74_4 @@ -130253,12 +130269,12 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v180, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v165, 3 +; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v18, v164, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v163, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v161, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v179, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 @@ -130304,16 +130320,16 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v162, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v128, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v119, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v117, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v145, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v144, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v129, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 @@ -130324,16 +130340,16 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19 -; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v20, v182, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v181, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v182, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v183, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v40, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v41, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v166, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v167, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v176, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v177, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v178, v26 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 @@ -130359,16 +130375,16 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v118, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v86, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v84, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v83, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v103, 3 +; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v28, v102, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v101, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v100, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v71, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 @@ -130379,16 +130395,16 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v146, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v147, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v148, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v149, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v150, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v130, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v131, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v132, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v133, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v134, v31 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 @@ -130414,13 +130430,13 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 
v27, v81, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v69, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v82, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v81, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v80, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v70, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3 @@ -130434,16 +130450,16 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v112, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v113, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v114, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v115, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v116, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v87, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v96, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v97, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v98, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 
v99, v36 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 @@ -130472,58 +130488,64 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: .LBB74_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 
offset:488 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592 +; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:404 +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v140, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:516 +; GFX11-FAKE16-NEXT: s_clause 0x18 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, 
off, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:592 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:596 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:604 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:608 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:612 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:616 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -133792,14 +133814,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 ; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 
offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:288 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276 @@ -134202,26 +134224,26 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v35 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 @@ -134641,30 +134663,30 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v33, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v32, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v36 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -134733,47 +134755,43 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v57, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v111, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:344 +; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:336 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:320 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 @@ -134783,88 +134801,88 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:304 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 
v131, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:208 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:100 
+; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:260 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 
v47, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:28 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5 @@ -134882,71 +134900,64 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14 -; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v46, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v47, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v181, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v182, 8, v30 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v165, 8, v31 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v96 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 -; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v97 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111 -; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v98 +; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(38) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v132 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v100 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v183, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v148, 8, v129 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v149 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v95 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v104 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v105 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v100, 8, v106 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v107 ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB75_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 @@ -135013,153 +135024,153 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v58 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v59 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v179 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v57 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v62 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v46 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v47 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 
v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v182 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v183 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v119 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v165 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v102 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v0, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v147 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v148 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v150 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v151 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 
v1, v129 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v85 ; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 @@ -135168,8 +135179,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v83 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 @@ -135177,30 +135188,30 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v96 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v97 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v98 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v69 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v100 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -135372,10 +135383,10 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v57 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v43 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v160 ; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2 @@ -135393,9 +135404,9 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v62, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v47, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v56, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -135411,14 +135422,13 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v42 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v180 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v179 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v182 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v164 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v163 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v162 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -135429,16 +135439,16 @@ define inreg <16 
x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v128 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v103 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v60, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v61, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v44, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v45, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v46, v15 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 @@ -135449,9 +135459,9 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v41, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v176, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v177, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -135466,14 +135476,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v144 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v135 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v119 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v161 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v112 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -135484,16 +135494,16 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 
v151, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v134 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v118 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v181, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v183, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v40, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v165, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v166, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v167, v20 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 @@ -135504,9 +135514,9 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v151, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v132, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v133, v27 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -135521,14 +135531,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v87 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v86 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v84 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v71 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v70 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v68 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -135539,14 +135549,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v148, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v149, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v150, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v129, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v130, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v131, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -135568,16 +135578,16 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 
v29, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v66 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v85 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v101 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v83 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v80 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v81 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v65 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v64 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -135591,11 +135601,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v96, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v97, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v98, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v99, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v100, v32 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 
v2, 0x300, v2 @@ -135624,47 +135634,43 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-FAKE16-NEXT: .LBB75_3: ; %end ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424 -; GFX11-FAKE16-NEXT: 
scratch_load_b32 v60, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 
offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:444 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:460 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB75_4: @@ -155300,223 +155306,223 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:380 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 
v48, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:364 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v146, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v36, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:260 -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:252 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v135, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:236 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v129, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v131, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:180 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v131, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v130, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v117, off, s32 offset:128 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v119, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v118, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v151, off, s32 offset:384 ; GFX11-TRUE16-NEXT: scratch_load_b32 v160, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 
offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v117, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v118, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v119, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v128, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v129, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v132, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:156 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v128, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v130, off, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v131, off, s32 
offset:176 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v132, off, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:192 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v151, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:284 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v145, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:76 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:36 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v12.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, 
v1.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v5.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v148.l ; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v147.l, 8, v147.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v146.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v134.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v129.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v131.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v130.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v119.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v112.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v151.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v97.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v100.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v103.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v103.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v114.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v114.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v115.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v115.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v117.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v128.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v117.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v132.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.l ; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v128.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v133.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v134.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v147.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v150.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v31.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -155529,101 +155535,101 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB88_3: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v65.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v64.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v4.h, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v53.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v66.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v70.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v71.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v68.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v71.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v70.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v83.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v87.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v96.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v86.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v97.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v100.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v50.h -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v67.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v66.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v80.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v36.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v101.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v48.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v81.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v81.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v82.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v83.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v97.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v70.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v98.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v99.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v99.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v101.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v102.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v102.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v103.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v112.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, 
v8.l, v112.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v113.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v113.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v115.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v116.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v86.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v86.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v84.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v97.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v99.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v99.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v97.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v103.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v103.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v114.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v115.h ; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v112.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v118.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v118.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v119.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v114.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v113.h ; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v119.h ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v128.l ; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v128.h @@ -155658,100 +155664,100 @@ define <64 x bfloat> 
@bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v150.h ; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v151.l ; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v151.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 ; GFX11-TRUE16-NEXT: 
; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: 
; implicit-def: $vgpr128_hi16 @@ -155789,11 +155795,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB88_2 ; GFX11-TRUE16-NEXT: .LBB88_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v39.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v116.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l @@ -155808,16 +155814,16 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v116.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v112.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v113.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v100.l, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v0.l ; GFX11-TRUE16-NEXT: 
v_or_b16 v0.h, v147.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v148.h, v1.l @@ -155827,10 +155833,10 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v102.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v101.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h @@ -155841,15 +155847,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v146.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v145.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v96.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v98.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h @@ 
-155865,13 +155871,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v85.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v85.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l @@ -155886,19 +155890,16 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v71.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v130.h, v1.l @@ -155908,12 +155909,10 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v66.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.l, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h @@ -155924,16 +155923,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v128.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v129.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v119.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v86.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v35.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) 
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -155942,7 +155940,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v128.l, v2.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v119.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v114.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v113.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l @@ -155952,103 +155950,103 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.h, 3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v70.l, 3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v71.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v117.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v114.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v116.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v116.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v115.l, v1.h +; GFX11-TRUE16-NEXT: 
v_or_b16 v0.l, v112.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v115.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v115.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v114.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v69.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.h, 3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v68.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v115.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v113.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v103.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v112.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v113.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v114.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v103.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v100.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v102.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v103.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v67.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v67.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v9.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v54.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v64.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v65.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v64.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v64.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v112.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v101.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v102.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v103.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v101.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v101.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v97.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v99.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v99.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v97.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v64.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v55.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v52.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v55.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v51.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.h, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v102.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v99.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v87.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v98.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v99.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v98.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v96.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v84.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v86.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v87.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v54.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v54.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v51.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v51.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v52.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v51.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v97.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v70.l, v0.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v2.l, v86.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v70.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v82.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v83.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v81.l, v1.h @@ -156124,207 +156122,204 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: 
scratch_load_u16 v101, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:232 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v82, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:128 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:80 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v127, off, s32 offset:384 ; GFX11-FAKE16-NEXT: scratch_load_b32 v150, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:56 ; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:64 ; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 
offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:224 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:100 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v120, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v123, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v121, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v122, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:36 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:20 ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v29 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v86, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v13 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v134, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v151, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v29 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 
v78, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v40 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v47 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v56 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v127 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v166 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v40 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v178, 8, v176 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v179 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v182 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v45 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v183 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v46 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v42 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v43 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v56 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v46 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v42, 8, v58 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v59 
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v44, 8, v60 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v57 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v30 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v61 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v62 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v75 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v63 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v72 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v74 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v22 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v73 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v74 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v63 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v75 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v76 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v77 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v78 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v72 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v79 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v89 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v91 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v88 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v90 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v76, 8, v95 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v89 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v18 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 
v90, 8, v104 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v105 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v94 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v90 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v107 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v106 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v94 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v105 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v120 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v123 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v121 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v122 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -156335,10 +156330,10 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v48 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v84 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v0, v0, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v96 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v51 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 @@ -156350,16 +156345,16 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v50 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v132 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v161 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v151 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v161 ; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v4, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v4, v7, v6, 0x5040100 @@ -156368,70 +156363,70 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v102 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v133 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v130 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v165 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v150 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v40 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v43 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v46 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v57 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v56 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v178 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v166 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v183 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v43 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v42 ; GFX11-FAKE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v8, v10, v9, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v9, v12, v11, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v10, v14, v13, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v11, v16, v15, 0x5040100 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v130 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v149 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v144 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v146 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v164 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v148 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v58 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v44 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v60 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v47 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v72 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v74 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v56 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v58 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v63 ; GFX11-FAKE16-NEXT: v_perm_b32 v12, 
v13, v12, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v13, v15, v14, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v14, v17, v16, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v15, v19, v18, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v16, v21, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v180 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v75 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v57 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v78 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v77 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v79 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v75 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v90 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v89 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v92 @@ -156441,16 +156436,16 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 
v19, v22, v21, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v23, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v101 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v45 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v99 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v93 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v88 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v104 @@ -156466,16 +156461,16 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v26, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v25, v29, v28, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v26, v31, v30, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v99 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v129 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v116 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v113 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v114 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v76 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v128 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v111 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v106 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v122 @@ -156507,94 +156502,94 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 ; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr89 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 @@ -156623,11 +156618,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB88_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v118, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v131, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v116, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v133, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v128, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v76, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v118, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v73, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -156639,15 +156634,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v124, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v33, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v98, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v116, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v61, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v98, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v112, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v128, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v114, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v99, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v62, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v103, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v113, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 ; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v3, 0xff, v3 @@ -156656,38 +156651,38 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v122, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v121, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v113, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v111, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v81, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v81, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v101, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v45, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v99, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v97, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v44, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v83, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v41, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v106, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v110, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v109, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v108, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v114, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v97, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 -; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v180, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v84, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v83, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v177, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 @@ -156696,142 +156691,134 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v104, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v95, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v64, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v42, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v70, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v70, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v167, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v164, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v183, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v82, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v180, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v163, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v88, v2 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v91, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v82, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v90, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v179, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v84, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v71, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v165, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v177, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v163, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v66, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v166, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v144, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v79, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v76, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v78, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v66, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v74, v4 ; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v145, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v151, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v112, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v71, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v148, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v117, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v178, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v80, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v164, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v68, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v61, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v74, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v72, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v63, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v60, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v103, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v162, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1 +; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v64, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v149, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v144, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v147, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v147, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v148, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v62, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v58, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v56, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v47, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v46, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v119, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v119, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v130, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v145, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v130, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v132, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v133, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v135, 3 ; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v117, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v44, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v57, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v43, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v46, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v42, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v183, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v45, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v129, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v182, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v114, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v116, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v96, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v101, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v102, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v103, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v100, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v55, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v43, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v179, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v1, v166, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v41, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v178, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v55, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v165, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v37, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v37, 0x300, v0 @@ -156845,13 +156832,13 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v150, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v162, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v50, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v161, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v160, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v160, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v151, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v52, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v51, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v2 @@ -156863,15 +156850,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v36, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v132, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v131, v2 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v115, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v119, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v102, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v113, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v39, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v38, 3 @@ -156881,11 +156868,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v71, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v82, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v70, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v66, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v87, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v96, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v98, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v86, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v81, v32 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v38, 0x300, v0 @@ -156903,28 +156890,28 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v96, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v129, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v117, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v119, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v118, 
v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v112, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v103, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v101, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v97, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v85, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v80, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v69, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v68, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v67, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v86, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v83, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v81, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v99, v29, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v98, v30, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v101, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v117, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v112, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v80, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v69, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v64, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v68, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v67, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v71, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v66, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v65, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v84, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v82, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v70, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v83, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v97, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v114, v26, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v27, v99, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v85, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v113, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v128, v30, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v118, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB88_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_clause 0x1f @@ -160865,85 +160852,85 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 ; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:288 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:264 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v63, off, s32 offset:260 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, 
off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:192 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:152 ; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v164, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:144 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:116 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v145, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:64 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:32 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 @@ -161051,7 +161038,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v102 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -161060,89 +161047,89 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v112 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 
0xff, v131 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v115 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v129 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v150 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v146 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v118 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v132 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 
0xff, v117 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v128 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v144 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v133 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v130 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v148 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v20, 16, v21 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v147 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135 ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v149 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v147 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v151 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 ; GFX11-TRUE16-NEXT: s_or_b32 
s7, s7, s8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v151 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v162 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 @@ -161170,90 +161157,90 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v41 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v182 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v181 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v44 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v183 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v45 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v43 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v56 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v44 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v45 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v58 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v47 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v60 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v56 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v1, 0xff, v61 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v62 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v74 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v59 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v72 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v73 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v79 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v78 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v63 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v76 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v77 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v28, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v77 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v76 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v75 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v88 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v78 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v40 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v79 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v183 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v92 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v91 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v43 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v92 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v93 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 @@ -161305,98 +161292,98 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v92 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v93 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46 ; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v90 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v91 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v79 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v183 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v89 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v92 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v88 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v78 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v40 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v180 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v77 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v90 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3 ; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v76 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v75 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v89 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v88 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v74 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v79 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v72 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v73 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v77 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v78 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v63 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v76 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v75 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v60 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v73 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v61 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v59 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v59, 0x300, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v60, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v74 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v72 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v180, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v40, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v57 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v62 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v58 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v57, 0x300, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v56 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v56, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v63 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v183, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v61 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v46, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v60 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v45 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v58 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v46 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v45, 0x300, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v44 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v59 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v43, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v43 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v56 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v41 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v45 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v42 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v47 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v40 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v183 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v44 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v42 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v41 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v181 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v181 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v182 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v179 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) @@ -161421,84 +161408,84 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v162 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v160 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v162 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v161, 0x300, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v160 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v161 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v151 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v149 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v149 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v147 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v148 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v148 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v163 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v130 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v132 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v128 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v150 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v116 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v146 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v144 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v130, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v134 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v129 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v119 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v117 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v115 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v113 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v114 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v112 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v103 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v102 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 @@ -161628,26 +161615,26 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v130 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v131 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v128, 16, v33 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 
0xffff, v176 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v118, 16, v32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v165, 16, v33 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v161, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v160, 16, v32 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v56 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v40 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 @@ -161657,11 +161644,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v116, 16, v19 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v59, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v43, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v183, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v180, 16, v29 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v35 ; GFX11-TRUE16-NEXT: .LBB89_3: ; %end @@ -161706,38 +161693,37 @@ define inreg <64 x bfloat> 
@bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:336 -; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:320 +; GFX11-FAKE16-NEXT: s_clause 0x1d +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:336 +; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20 @@ -161747,170 +161733,163 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 
offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:152 -; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:228 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v77, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:204 ; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:240 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, 
s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:260 +; GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:268 ; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:312 ; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 
v182, off, s32 offset:260 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:132 -; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:20 
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:28 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v7 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v103, 8, v29 ; 
GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v4 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v145, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v162, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v163, 8, v30 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) 
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v183, 8, v183 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v56 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v60 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v60 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v61 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v62 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v63 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v72 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v73 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v74 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v75 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v76 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v77 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v78 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v74 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v182, 8, v180 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v40 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v41 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v167 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 
v41, 8, v45 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v176 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v57 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v177 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v47, 8, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v73 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v72 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v77 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v78 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v79 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v88 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v89 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v93 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v90 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v91 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v92 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v93 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v94 -; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v2 ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB89_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff @@ -161964,16 +161943,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v2, 0xff, v49 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v81 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v83 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v70 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v84 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v85 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53 @@ -161981,158 +161960,158 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v102 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v69 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v3, v3, v103 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v129 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v80 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v145 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v97 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v133 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v132 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v160 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v16, 0xff, v101 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v161 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v166 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v163 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v113 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v178 ; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v1, v1, v177 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v114 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v180 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v128 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v183 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v119 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v0, v0, v43 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v46 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v134 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v47 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v147 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 
v1, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v44 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v58 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 ; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v59 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v88 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v47 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v76 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v117 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v91 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v90 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v75 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93 @@ -162185,14 +162164,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v58 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 ; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 ; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v57 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v75 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v47 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v76 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 @@ -162200,148 +162178,140 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v46 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v43 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v61 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v40 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v59 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v90, v3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v183 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v182 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v62 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v58 ; GFX11-FAKE16-NEXT: v_and_b32_e32 
v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v89, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v164 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v88, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v78, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v79, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v178 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v178, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v44 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v176 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v164 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v150 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v148, 0x300, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v75, v1 -; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v147 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v73, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v147, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v63, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v150 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v42 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v60, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v134 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v145 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v146 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v57, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v131 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v62, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v61, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v134, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v46, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v119 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v43, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v130 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v115 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v165 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v128 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v162 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v44, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v42, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v40, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v183, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v182, v3 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v115 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v180, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v113 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v100 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v180, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v178, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v133 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v101 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v177, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v167, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v176, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v2, 3, v119 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v100, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v98 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v98, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v101, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v151 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v116 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v96 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v97 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v96, 0x300, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v163, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v99 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v130, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v86 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v82 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v86, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v161, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v160, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v103 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v80 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v54 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, 
v53 @@ -162350,71 +162320,70 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v39 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 3, v33 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v113, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v145, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v149, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v71 ; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v101, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v102, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v131, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v132, v6 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v96 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v162, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v97, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v129, v6 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v87, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v116, v6 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v86, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v4, v112, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v84, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v103, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v102, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v50 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v49 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v83, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v99, v7 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v48 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v82, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v87, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v38 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v81, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v80, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v83, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v84, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v37 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v81, v8 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34 ; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v34, 3, v35 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v69, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v70, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v112, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v133, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v68, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v67, v7 @@ -162458,71 +162427,70 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v116 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v98 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v86, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v101, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v115 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v134 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v145, 
16, v32 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v118, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v114, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v182 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v117 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v28 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v178, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v100, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v147, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v118, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB89_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:352 -; 
GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:440 +; GFX11-FAKE16-NEXT: s_clause 0x1d +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, 
off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:436 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB89_4: @@ -167751,9 +167719,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:12 ; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr144_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 @@ -167865,7 +167833,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -168231,7 +168199,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v30, 0x7fff ; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v35, v27 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v103, v34, v38 :: v_dual_and_b32 v38, 0xffff0000, v32 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 @@ -168260,7 +168228,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v33, v102 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 ; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v36, v29 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v31 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v32, 16, 1 @@ -168903,7 +168870,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v128.l ; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v116.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v119.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; 
GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v114.h @@ -168917,6 +168883,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v20 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v21 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v23 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 @@ -169002,8 +168969,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:12 ; GFX11-FAKE16-NEXT: s_clause 0x2 ; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -169432,7 +169399,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 ; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v51, 16, 1 ; GFX11-FAKE16-NEXT: v_add3_u32 v30, v52, v49, 0x7fff -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v32 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v51 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v39, v29, vcc_lo @@ -169445,7 +169412,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v50, 16, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v30, 16, 1 @@ -170086,11 +170052,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v113 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v100 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v96 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v99 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v97 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v112 @@ -183442,223 +183409,223 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:380 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:364 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:356 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v147, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v146, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 
offset:260 -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:252 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v135, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:236 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v129, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v131, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v130, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v117, off, s32 offset:128 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:124 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v119, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v118, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v151, off, s32 offset:384 ; GFX11-TRUE16-NEXT: scratch_load_b32 v160, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v117, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v118, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v119, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v128, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v129, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v132, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:156 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v128, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v130, off, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v131, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v132, off, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:192 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v151, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:204 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v100, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 
offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:76 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v12.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v5.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, 
v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v148.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v147.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v146.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v134.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v132.l, 8, v132.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v129.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v131.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v130.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v119.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v112.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v151.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v97.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v100.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v103.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v103.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v114.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v114.l, 8, v114.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v115.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v115.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v117.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v128.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v117.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v132.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v128.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, 
v134.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v133.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v134.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v147.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v150.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v31.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -183671,101 +183638,101 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB92_3: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v65.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v64.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v53.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v66.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v70.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v71.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v68.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v71.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v70.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v83.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v87.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v96.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v97.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v100.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v35.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v50.h -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v67.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v66.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v80.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v98.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v101.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v48.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v81.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v81.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v82.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v83.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v97.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v70.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v98.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v99.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v99.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v101.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v102.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v102.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v103.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v112.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v112.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v113.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v113.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v115.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v116.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v86.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, 
v2.h, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v86.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v84.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v97.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v99.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v99.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v97.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v103.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v103.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v114.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v115.h ; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v112.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v118.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v118.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v119.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v114.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v113.h ; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v119.h ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v128.l ; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v128.h @@ -183800,100 +183767,100 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v150.h ; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v151.l ; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v151.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr87_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 @@ -183931,11 +183898,11 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB92_2 ; GFX11-TRUE16-NEXT: .LBB92_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v39.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v39.l, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v116.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l @@ -183950,16 +183917,16 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v116.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v112.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v113.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v100.l, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v147.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v148.h, v1.l @@ -183969,10 +183936,10 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v36.h, 
3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v102.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v101.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h @@ -183983,15 +183950,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v146.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v145.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v96.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v98.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h @@ -184007,13 +183974,11 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v85.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v85.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l @@ -184028,19 +183993,16 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v71.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v130.h, v1.l @@ -184050,12 +184012,10 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v18.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v66.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.l, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h @@ -184066,16 +184026,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v128.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v129.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v119.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v86.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v35.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -184084,7 +184043,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v128.l, v2.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v119.l, v0.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v0.h, v114.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v113.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l @@ -184094,103 +184053,103 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.h, 3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v70.l, 3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v71.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v117.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v114.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v116.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v116.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v115.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v112.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v115.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v115.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v114.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v69.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l ; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.h, 3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v68.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v115.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v113.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v103.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v112.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v113.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v114.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v103.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v100.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v102.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v103.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v67.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v67.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v54.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v64.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v65.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v64.l, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v64.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v112.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v101.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v102.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v103.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v101.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v101.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v97.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v99.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v99.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v97.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v64.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v55.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v52.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v55.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v51.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v102.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v99.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v87.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v98.h, v1.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v1.h, v99.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v98.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v96.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v84.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v86.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v87.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v54.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v54.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v51.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v51.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v52.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v51.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v97.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v70.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v86.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v70.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v82.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v83.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v81.l, v1.h @@ -184266,207 +184225,204 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0 ; GFX11-FAKE16-NEXT: s_clause 
0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:168 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:128 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v127, off, s32 offset:384 ; GFX11-FAKE16-NEXT: scratch_load_b32 v150, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:40 
-; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:56 ; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:64 ; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:228 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v88, off, s32 offset:224 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, 
off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v120, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v123, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v121, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v122, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 
offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:20 ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v29 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v86, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v134, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v151, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v29 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 
v126, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v40 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v47 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v56 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, 
v92 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v127 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v166 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v40 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v178, 8, v176 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v179 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v182 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v45 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v183 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v46 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v42 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v43 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v56 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v46 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v42, 8, v58 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v59 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v44, 8, v60 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v57 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v30 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v61 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v62 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v75 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v63 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v72 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v74 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v22 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v73 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v74 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v63 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v75 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v76 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v77 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v78 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v72 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v79 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v89 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v91 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v88 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v90 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v76, 8, v95 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v89 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v18 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v104 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v105 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v94 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v90 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v107 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v106 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v94 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v105 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v120 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v123 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v121 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v122 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -184477,10 +184433,10 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v48 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v84 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v96 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v51 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 @@ -184492,16 +184448,16 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v50 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, 
v54 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v132 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v161 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v151 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v161 ; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v4, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v4, v7, v6, 0x5040100 @@ -184510,70 +184466,70 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v133 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v130 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v100 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v165 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v150 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v40 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v43 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v46 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v57 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v56 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v178 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v166 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v183 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v43 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v42 ; GFX11-FAKE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v8, v10, v9, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v9, v12, v11, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v10, v14, v13, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v11, v16, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v130 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v149 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v144 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v146 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v164 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v148 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v58 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v44 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v60 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v47 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v72 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v74 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v56 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v58 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v63 ; GFX11-FAKE16-NEXT: v_perm_b32 v12, v13, v12, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v13, v15, v14, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v14, v17, v16, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v15, v19, v18, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v16, v21, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v180 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v75 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v57 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v78 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v77 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v79 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v75 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v90 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v89 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v92 @@ -184583,16 +184539,16 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v19, v22, v21, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v23, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v97 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v101 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v45 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v99 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v93 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v88 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v104 @@ -184608,16 +184564,16 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v26, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v25, v29, v28, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v26, v31, v30, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v99 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v129 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v116 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v113 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v114 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v76 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, 
v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v128 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v111 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v106 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v122 @@ -184649,94 +184605,94 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 @@ -184765,11 +184721,11 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB92_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v118, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v131, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v116, 3 -; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v4, v129, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v133, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v128, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v76, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v118, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v73, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -184781,15 +184737,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v124, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v33, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v98, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v116, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v61, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v98, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v112, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v128, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v114, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v99, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v62, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v103, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v113, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 @@ -184798,38 +184754,38 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v122, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v121, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v113, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v111, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v81, 3 -; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v81, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v101, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v45, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v99, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v97, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v44, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v83, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v41, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v106, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v110, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v109, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v108, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v114, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v97, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v180, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v84, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v83, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v177, 3 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 @@ -184838,142 +184794,134 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v104, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v95, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v64, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v42, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v70, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v70, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v167, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v164, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v183, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v82, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v180, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v163, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v88, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v91, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v82, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v90, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) -; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v179, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v84, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v71, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v165, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v177, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v163, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v66, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v166, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v144, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v79, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v76, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v78, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v66, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v74, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v145, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v151, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v112, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v71, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v148, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v117, 3 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v178, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v80, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v164, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v68, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v61, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v74, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v72, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v63, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v60, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v103, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v162, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v64, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v149, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v144, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v147, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v147, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v148, 3 ; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v62, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v58, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v56, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v47, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v46, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v119, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v119, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v130, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v145, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v130, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v132, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v133, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v135, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v117, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v44, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v57, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v43, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 -; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v117, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v46, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v42, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v183, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v45, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v129, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v182, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v114, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v116, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v96, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v101, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v102, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v103, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v100, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v55, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v43, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v179, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v41, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v178, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v55, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v165, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2 ; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v2, v37, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v37, 0x300, v0 @@ -184987,13 +184935,13 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v150, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v162, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v50, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v161, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v160, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v160, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v151, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v52, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v51, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v2 @@ -185005,15 +184953,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v36, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v132, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v131, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v115, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v119, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v102, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v113, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v39, 3 ; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v38, 3 @@ -185023,11 +184971,11 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v71, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v82, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v70, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v66, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v87, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v96, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v98, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v86, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v81, v32 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v38, 0x300, v0 @@ -185045,28 +184993,28 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v96, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v129, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v117, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v119, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v118, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v112, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v103, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v101, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v97, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v85, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v80, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v69, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 
0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v68, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v67, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v86, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v83, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v81, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v99, v29, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v98, v30, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v101, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v117, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v112, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v80, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v69, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v64, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v68, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v67, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v71, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v66, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v65, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v84, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v82, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v70, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v83, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v97, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v114, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v99, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v85, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v113, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v128, v30, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v118, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB92_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_clause 0x1f @@ -188911,85 +188859,85 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 ; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:288 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:260 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:232 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:220 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v45, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:192 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:144 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:84 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:64 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4 ; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 @@ -189097,7 +189045,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v102 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -189106,89 +189054,89 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v112 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v115 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, 
v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v129 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v150 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v146 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v118 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v132 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v128 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v144 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v133 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v130 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v148 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v20, 16, v21 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v147 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135 ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v149 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v147 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v151 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v151 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v2, 8, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v162 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 @@ -189216,90 +189164,90 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v41 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v182 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v181 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v44 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v183 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v45 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v43 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v56 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v44 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v45 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v58 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v47 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v60 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v56 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v61 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v62 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v74 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v59 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v72 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v73 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v79 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v78 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v63 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v76 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v77 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v77 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v76 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v75 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v88 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v78 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v40 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v79 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v183 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v92 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v91 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v43 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v92 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v93 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 @@ -189351,98 +189299,98 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v92 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v93 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46 ; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v90 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v91 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v79 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v183 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v89 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v92 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v88 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v78 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v40 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v180 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v77 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v90 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v76 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v75 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v89 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v88 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v74 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 
v0, 3, v79 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v72 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v73 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v77 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v78 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v63 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v76 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v75 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v60 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v73 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v61 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v59 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v59, 0x300, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v60, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v74 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v72 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v180, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v40, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v57 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v62 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v58 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v57, 0x300, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v56 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v56, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v63 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v183, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, 
v61 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v46, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v60 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v45 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v58 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v46 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v45, 0x300, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v44 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v59 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v43, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v43 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v56 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v41 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v45 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v42 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v47 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v40 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v183 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v44 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v42 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v41 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v1 ; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v181 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v181 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v182 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v179 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) @@ -189467,84 +189415,84 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v162 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v160 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v162 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v161, 0x300, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v160 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v161 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v151 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v149 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v149 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v147 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v148 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v148 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v163 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v130 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v132 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v128 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v150 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v116 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v146 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v144 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v130, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v134 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v129 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v119 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v117 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v115 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v113 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v114 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v112 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v103 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v102 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 @@ -189674,26 +189622,26 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v130 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v131 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v128, 16, v33 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v118, 16, v32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v165, 16, v33 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v161, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v160, 16, v32 ; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v25, v25, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v56 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v40 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 @@ -189703,11 +189651,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v116, 16, v19 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v59, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v43, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v183, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v180, 16, v29 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v35 ; GFX11-TRUE16-NEXT: .LBB93_3: ; %end @@ -189752,38 +189700,37 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:424 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v45, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:320 +; GFX11-FAKE16-NEXT: s_clause 0x1d +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:424 +; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v44, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20 @@ -189793,170 +189740,163 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:48 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:152 -; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:204 ; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:224 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:240 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 
v80, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:260 +; GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:268 ; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:312 ; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:260 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:196 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:132 -; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:124 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v165, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:28 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v7 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v103, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 
v128, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v4 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v145, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v162, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v163, 8, v30 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v183, 8, v183 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v56 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v60 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v60 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) -; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v61 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v62 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v63 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v72 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v73 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v74 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v75 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v76 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v77 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v78 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v74 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v182, 8, v180 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v40 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v41 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v167 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v45 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v176 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v57 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v177 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v47, 8, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v73 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v72 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v77 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v78 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v79 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v88 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v89 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v93 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v90 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v91 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v92 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v93 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v94 -; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v2 ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB93_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff @@ -190010,16 +189950,16 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v81 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v83 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v70 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v84 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v85 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53 @@ -190027,158 +189967,158 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v102 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v103 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v129 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 
v3, 0xff, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v80 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v145 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v97 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v133 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v132 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v101 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v161 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v98 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v16, v16, v166 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v163 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v113 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v178 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v114 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v180 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v128 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v183 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v119 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 ; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v46 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v134 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v47 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v147 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 ; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v44 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v58 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v59 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v0, v0, v89 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v88 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v47 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v76 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v117 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v91 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v90 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v75 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93 @@ -190231,14 +190171,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v58 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 ; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 ; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v57 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v75 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v47 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v76 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 @@ -190246,148 +190185,140 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v46 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v43 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v61 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v40 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v59 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v90, v3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v183 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v182 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v62 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v58 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v89, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v164 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v88, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v1 ; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v78, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v79, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v178 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v178, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v44 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v176 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v164 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v150 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v148, 0x300, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v75, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v147 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v73, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v147, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v63, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1 -; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v150 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v42 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v60, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v134 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v145 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v146 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v57, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v131 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v62, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v61, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v134, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v46, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) -; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v119 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v43, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v130 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v115 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v165 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v128 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v162 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v44, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v42, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v40, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v183, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v182, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v115 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v180, v3 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v113 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v100 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v180, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v178, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v133 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v101 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v177, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v167, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v176, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v119 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v100, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v98 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v98, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v101, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v151 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 -; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(5) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v116 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v96 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v97 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v96, 0x300, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v163, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v99 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v130, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v86 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v82 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v86, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v161, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v160, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v103 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v80 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v54 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v53 @@ -190396,71 +190327,70 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v39 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 3, v33 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v113, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v145, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v100 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v149, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v71 ; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v101, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v102, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v131, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v132, v6 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v96 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v162, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v97, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v129, v6 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v87, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v116, v6 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v86, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v112, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v84, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v103, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v102, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v50 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v49 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v83, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v99, v7 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v48 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v82, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v87, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v38 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v81, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v80, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v83, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v84, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v37 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v81, v8 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 3, v35 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v69, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v70, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v112, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v133, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v68, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5 
; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v67, v7 @@ -190504,71 +190434,70 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v116 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v98 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v86, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v101, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v115 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v134 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v118, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v114, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v182 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v148 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v117 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v28 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v178, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v100, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v147, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v118, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB93_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 
offset:384 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:440 +; GFX11-FAKE16-NEXT: s_clause 0x1d +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:380 +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v61, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:436 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB93_4: @@ -194624,9 +194553,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 @@ -194690,7 +194619,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -194767,9 +194696,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] @@ -195139,11 +195067,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24 ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v80.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v71.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v70.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h @@ -195210,8 +195138,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 ; GFX11-FAKE16-NEXT: s_clause 0x2 ; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 @@ -195418,9 
+195346,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] @@ -195811,11 +195738,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v87 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v85 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v84 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v81 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v71 @@ -207157,223 +207085,223 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:380 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, 
off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:364 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v146, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v146, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:260 -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:252 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v135, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:236 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v129, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v131, off, s32 offset:184 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v35, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v130, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v117, off, s32 offset:128 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v119, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v118, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v151, off, s32 offset:384 ; GFX11-TRUE16-NEXT: scratch_load_b32 v160, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 
offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v117, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v118, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v119, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v128, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v129, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v132, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:156 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v128, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v130, off, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:168 
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v131, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v132, off, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:192 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v151, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 
v100, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:76 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:44 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v71, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v12.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v0.l 
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v5.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v148.l, 8, v148.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v147.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v146.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v134.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v129.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v131.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v130.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v119.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v112.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v151.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v97.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v100.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v103.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v103.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v114.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v114.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v115.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v115.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v117.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v128.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v117.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v132.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v128.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v133.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v134.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v147.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v150.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v150.h, 8, v31.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -207386,101 +207314,101 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB96_3: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v65.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v64.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v53.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v53.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v66.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v70.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v71.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v68.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v71.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v70.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v83.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v87.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 
0xff, v96.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v97.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v100.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v50.h -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v67.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v66.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v80.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, 
v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v101.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v48.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v81.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v81.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v82.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v83.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v97.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v70.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v98.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v99.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v99.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v101.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v102.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v102.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v103.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v112.l -; GFX11-TRUE16-NEXT: 
v_or_b16 v7.h, v7.h, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v112.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v113.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v113.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v115.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v116.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v86.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v86.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v84.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v97.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v99.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v99.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v97.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v103.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v103.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v114.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v115.h ; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v112.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v118.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v118.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v119.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v114.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v113.h ; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v119.h ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v128.l ; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v128.h 
@@ -207515,100 +207443,100 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v150.h ; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v151.l ; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v151.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr83_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 @@ -207646,11 +207574,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_2 ; GFX11-TRUE16-NEXT: .LBB96_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v39.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v116.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l @@ -207665,16 +207593,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v116.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v112.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v113.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v100.l, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, 
v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v147.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v148.h, v1.l @@ -207684,10 +207612,10 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v102.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v101.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h @@ -207698,15 +207626,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v146.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v145.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v96.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v98.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 
0xff, v0.h @@ -207722,13 +207650,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v85.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v85.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l @@ -207743,19 +207669,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v71.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v130.h, v1.l @@ -207765,12 +207688,10 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v66.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.l, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h @@ -207781,16 +207702,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v128.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v129.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v119.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v86.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v35.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3 ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(10) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -207799,7 +207719,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v128.l, v2.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v119.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v114.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v113.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l @@ -207809,103 +207729,103 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.h, 3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v70.l, 3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v71.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v117.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v114.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v116.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v116.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v115.l, v1.h +; GFX11-TRUE16-NEXT: 
v_or_b16 v0.l, v112.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v115.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v115.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v114.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v69.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.h, 3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v68.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v115.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v113.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v103.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v112.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v113.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v114.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v103.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v100.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v102.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v103.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v67.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v67.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v9.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v54.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v64.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v65.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v64.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v64.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v112.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v101.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v102.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v103.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v101.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v101.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v97.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v99.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v99.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v97.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v64.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v55.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v52.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v55.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v51.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.h, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v102.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v99.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v87.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v98.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v99.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v98.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v96.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v84.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v86.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v87.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v54.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v54.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v51.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v51.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v52.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v51.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v97.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v70.l, v0.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v2.l, v86.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v70.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v82.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v83.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v81.l, v1.h @@ -207981,207 +207901,204 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: 
scratch_load_u16 v101, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:232 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v82, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:128 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:80 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v127, off, s32 offset:384 ; GFX11-FAKE16-NEXT: scratch_load_b32 v150, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:56 ; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:64 ; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 
offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:224 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:100 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v120, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v123, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v121, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v122, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:36 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:20 ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v29 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v86, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v13 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v134, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v151, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v29 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 
v78, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v40 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v47 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v56 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v127 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v166 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v40 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v178, 8, v176 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v179 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v182 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v45 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v183 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v46 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v42 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v43 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v56 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v46 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v42, 8, v58 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v59 
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v44, 8, v60 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v57 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v30 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v61 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v62 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v75 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v63 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v72 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v74 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v22 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v73 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v74 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v63 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v75 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v76 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v77 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v78 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v72 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v79 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v89 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v91 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v88 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v90 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v76, 8, v95 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v89 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v18 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 
v90, 8, v104 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v105 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v94 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v90 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v107 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v106 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v94 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v105 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v120 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v123 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v121 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v122 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -208192,10 +208109,10 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v48 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v84 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v0, v0, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v96 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v51 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 @@ -208207,16 +208124,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v50 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v132 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v161 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v151 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v161 ; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v4, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v4, v7, v6, 0x5040100 @@ -208225,70 +208142,70 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v102 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v10, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v133 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v130 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v165 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v150 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v40 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v43 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v46 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v57 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v56 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v178 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v166 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v183 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v43 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v42 ; GFX11-FAKE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v8, v10, v9, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v9, v12, v11, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v10, v14, v13, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v11, v16, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 
v12, 0xff, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v130 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v149 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v144 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v146 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v164 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v148 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v58 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v44 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v60 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v47 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v72 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v74 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v56 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v58 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v63 ; GFX11-FAKE16-NEXT: v_perm_b32 v12, v13, v12, 0x5040100 ; 
GFX11-FAKE16-NEXT: v_perm_b32 v13, v15, v14, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v14, v17, v16, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v15, v19, v18, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v16, v21, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v180 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v75 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v57 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v78 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v77 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v79 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v75 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v90 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v89 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v92 @@ -208298,16 +208215,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v19, v22, v21, 0x5040100 ; 
GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v23, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v101 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v45 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v99 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v93 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v88 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v104 @@ -208323,16 +208240,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v26, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v25, v29, v28, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v26, v31, v30, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v99 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v129 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v116 -; GFX11-FAKE16-NEXT: v_and_b32_e32 
v35, 0xff, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v113 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v114 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v76 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v128 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v111 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v106 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v122 @@ -208364,94 +208281,94 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 ; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr92 @@ -208480,11 +208397,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v118, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v131, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v116, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v133, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v128, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v76, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v118, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v73, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -208496,15 +208413,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v124, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v33, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v98, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v116, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v61, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v98, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v112, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v128, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v114, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v99, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v62, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v103, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v113, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 @@ -208513,38 +208430,38 
@@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v122, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v121, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v113, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v111, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v81, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v81, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v101, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v45, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v99, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v97, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v44, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v83, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v41, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v106, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v110, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v109, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v108, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v114, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v97, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2 -; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v180, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v84, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v83, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v177, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 @@ -208553,142 +208470,134 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v104, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v95, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v64, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v42, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v70, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v70, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v167, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v164, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v183, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v82, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v180, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v163, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v88, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v91, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v82, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v90, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v179, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v84, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v71, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v165, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v177, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v163, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v66, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v166, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v144, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v79, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v76, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v78, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v66, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v74, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v0 -; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v145, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v151, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v112, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v71, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v148, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v117, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v178, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v80, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v164, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v68, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v61, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v74, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v72, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v63, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v60, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v103, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v162, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v64, 3 +; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v149, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v144, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v147, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v147, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v148, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v62, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v58, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v56, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v47, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v46, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v119, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v119, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v130, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v145, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v130, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v132, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v133, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v135, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v4, v117, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v44, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v57, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v43, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v46, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v42, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v183, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v45, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v129, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v182, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v114, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v116, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v96, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v101, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v102, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v103, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v100, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v55, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v43, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v179, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v1 ; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v41, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v178, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v55, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v165, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v37, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v37, 0x300, v0 @@ -208702,13 +208611,13 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v150, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v162, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v50, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v161, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v160, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v160, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v151, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v52, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v51, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v2 @@ -208720,15 +208629,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v36, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v132, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v131, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v115, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v119, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v102, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v113, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v39, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v38, 3 @@ -208738,11 +208647,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v71, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v82, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v70, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v66, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v87, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v96, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v98, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v86, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v81, v32 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v38, 0x300, v0 @@ -208760,28 +208669,28 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v96, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v129, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v117, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v119, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v118, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, 
v112, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v103, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v101, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v97, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v85, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v80, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v69, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v68, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v67, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v86, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v83, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v81, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v99, v29, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v98, v30, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v101, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v117, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v112, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v80, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v69, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v64, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v68, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v67, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v71, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v66, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v65, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v84, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v82, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v70, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v83, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v97, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v114, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v99, v27, 0x5040100 
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v85, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v113, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v128, v30, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v118, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB96_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_clause 0x1f @@ -212565,85 +212474,85 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 ; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:288 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:260 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:248 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v72, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:192 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:188 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:184 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:172 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:168 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:148 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:144 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:112 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:64 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 
offset:28 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 @@ -212751,7 +212660,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v102 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -212760,89 +212669,89 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v112 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v15, 8, v118 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v115 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v129 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v150 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v146 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v118 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v132 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v21, 8, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v128 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v144 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v133 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v130 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v148 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v20, 16, v21 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v147 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135 ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v149 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v147 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v151 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, 
s10 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v151 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v162 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 @@ -212870,90 +212779,90 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v41 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v182 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v181 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v44 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v183 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v45 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v43 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v56 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v44 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v45 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v58 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v47 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v60 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v56 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v61 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v62 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v74 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v59 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v72 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v73 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v79 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v78 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v63 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v76 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v77 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 
v0, 0xff, v77 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v76 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v75 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v88 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v78 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v40 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v79 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v183 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v92 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v91 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v43 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v92 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v93 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 @@ -213005,98 +212914,98 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v92 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v93 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46 ; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v90 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v91 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v79 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v183 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v89 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v92 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v88 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v78 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v40 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v180 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v77 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v90 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0 ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v76 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v75 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v89 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v88 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v74 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v79 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v72 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v73 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v77 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v78 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v63 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v76 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v75 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v60 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v73 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v61 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v59 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v59, 0x300, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v60, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v74 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v72 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v180, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v40, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v57 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v62 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v58 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v57, 0x300, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v56 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v56, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v63 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v183, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v61 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v46, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v60 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v45 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v58 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v46 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v45, 0x300, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v44 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v59 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v43, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v43 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v56 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v41 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v45 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v42 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v47 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v2, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v40 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v183 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v44 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v42 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v41 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v181 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v181 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v182 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v179 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) @@ -213121,84 +213030,84 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v162 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v160 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v162 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v161, 0x300, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v160 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v161 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v4, 8, v151 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v149 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v149 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v147 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v148 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v148 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v163 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v130 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v132 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v128 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v150 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v116 ; GFX11-TRUE16-NEXT: v_and_b32_e32 
v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v146 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v144 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v130, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v134 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v129 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v119 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v117 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v115 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v113 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v114 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v112 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v103 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v102 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 @@ -213328,26 +213237,26 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v130 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v131 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v128, 16, v33 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23 -; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v118, 16, v32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v165, 16, v33 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v161, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v160, 16, v32 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v56 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v40 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 @@ -213357,11 +213266,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v116, 16, v19 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v59, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v43, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v183, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v180, 16, v29 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v35 ; GFX11-TRUE16-NEXT: .LBB97_3: ; %end @@ -213406,38 +213315,37 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: 
bitcast_v128i8_to_v64i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:332 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v92, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:320 +; GFX11-FAKE16-NEXT: s_clause 0x1d +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:332 +; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v91, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20 @@ -213447,170 +213355,163 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:288 +; 
GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:152 -; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 
v164, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:204 ; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:240 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:68 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:260 +; GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:268 ; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:312 ; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:260 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 
offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:132 -; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:12 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:28 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v7 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v103, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(62) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v4 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v145, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v162, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v163, 8, v30 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v183, 8, v183 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v56 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v60 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v60 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v61 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v62 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v63 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v72 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v73 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v74 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v75 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v76 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v77 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v78 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v74 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v182, 8, v180 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v40 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v41 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v167 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v45 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v176 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v57 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v177 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v47, 8, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v73 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v72 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v77 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v78 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v79 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v88 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v89 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v93 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v90 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v91 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v92 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v93 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v94 -; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v2 ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB97_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff @@ -213664,16 +213565,16 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 ; GFX11-FAKE16-NEXT: 
v_lshl_or_b32 v5, v3, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v81 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v83 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v70 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v84 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v85 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53 @@ -213681,158 +213582,158 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v102 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v103 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v12, v10, v129 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v80 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v145 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v97 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v133 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v132 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v101 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v20, v14, v161 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v166 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v163 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v113 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v178 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177 ; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v114 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v180 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v128 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v183 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v119 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v1, v1, v41 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v46 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v134 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v47 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v147 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 
v0, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v44 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v58 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v59 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v88 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v47 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v76 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v117 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v91 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v90 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v75 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93 @@ -213885,14 +213786,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v0, 3, v58 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 ; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 ; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v57 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v75 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v47 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v76 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 @@ -213900,148 +213800,140 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v46 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v43 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v61 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v40 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v59 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v90, v3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v183 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v182 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v62 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v58 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v89, v1 ; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v164 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v88, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v78, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v79, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v178 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v178, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v44 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v176 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v164 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v150 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v148, 0x300, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v75, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v147 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v73, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v147, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v63, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v150 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v42 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v60, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v134 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v145 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v146 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v57, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v131 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v62, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v61, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v134, 0x300, v1 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v46, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v119 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v43, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v130 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v115 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v165 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v128 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v162 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v44, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v42, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v40, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v183, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v182, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v17, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v115 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v180, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v113 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v100 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v180, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v178, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v133 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v101 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v177, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v167, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v176, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v119 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v100, 0x300, 
v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v98 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v98, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v101, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v151 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v116 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v96 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v97 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v96, 0x300, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v163, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v99 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v130, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v86 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v82 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v86, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v161, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v160, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v103 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v80 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v54 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v53 @@ -214050,71 +213942,70 @@ define inreg <64 x i16> 
@bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v39 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 3, v33 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v113, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v145, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v149, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v71 ; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v101, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v102, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v131, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v132, v6 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v96 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v162, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v97, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v129, v6 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v87, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v116, v6 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v86, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v112, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 
0x300, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v84, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v103, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v102, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v50 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v49 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v83, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v99, v7 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v48 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v82, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v87, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v38 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v81, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v80, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v83, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v84, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v37 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v81, v8 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 3, v35 ; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v35, 0x300, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v69, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v70, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v112, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v133, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v68, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v67, v7 @@ -214158,71 +214049,70 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v116 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v98 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v86, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v101, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v115 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v134 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, 
v118, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v114, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v182 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v117 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v28 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v178, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v100, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v147, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v118, 16, v33 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB97_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, 
s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:440 +; GFX11-FAKE16-NEXT: s_clause 0x1d +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:352 +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v76, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:436 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB97_4: @@ -218482,9 +218372,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: 
; implicit-def: $vgpr162_lo16 @@ -218548,7 +218438,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -218625,9 +218515,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] @@ -218997,11 +218886,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24 ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v80.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v71.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v70.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h @@ -219068,8 +218957,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 ; 
GFX11-FAKE16-NEXT: s_clause 0x2 ; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 @@ -219276,9 +219165,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] @@ -219669,11 +219557,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v87 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v85 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v84 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v81 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v71 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 35ab38c67b1ec..91689b9ef3465 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ 
b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -5277,15 +5277,15 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x9 -; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l @@ -5319,14 +5319,14 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v33.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l -; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(5) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -5574,15 +5574,15 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, v4 :: v_dual_mov_b32 v32, v2 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0x9 -; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v36, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:24 ; GFX11-FAKE16-NEXT: scratch_load_u16 v37, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_u16 v38, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:4 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v3 @@ -5599,17 +5599,17 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v27 ; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v29, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -12764,15 +12764,15 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x9 -; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, 
off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l @@ -12806,14 +12806,14 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v33.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -13061,15 +13061,15 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, v4 :: v_dual_mov_b32 v32, v2 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0x9 -; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v36, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:24 ; GFX11-FAKE16-NEXT: scratch_load_u16 v37, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_u16 v38, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:4 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v3 @@ -13086,17 +13086,17 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v66 ; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -19860,16 +19860,16 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x9 -; GFX11-TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l @@ -19908,14 +19908,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v28.h, 8, v33.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v34.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v33.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v35.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v37 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -19945,9 +19945,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v18.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v18.h @@ -19985,9 +19985,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 @@ -20012,11 +20012,10 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -20104,16 +20103,16 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v35, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0 ; GFX11-FAKE16-NEXT: s_clause 0x9 -; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 
offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, v14 :: v_dual_mov_b32 v32, v12 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v49, 8, v3 @@ -20127,20 +20126,20 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v19 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v21 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v69, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v10 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -20187,13 +20186,13 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v28 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v30 
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v69 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v23 @@ -20222,10 +20221,10 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 @@ -20238,7 +20237,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 @@ -20249,14 +20248,12 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: 
s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-FAKE16-NEXT: .LBB50_4: ; %cmp.true -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v68, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v66, 3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v64, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v28, 3 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v68, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 @@ -20289,7 +20286,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v25, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v64, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v66, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v18, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v16, 3 @@ -26437,16 +26434,16 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x9 -; GFX11-TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 
offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l @@ -26485,14 +26482,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v34.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v33.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v35.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v37 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -26522,9 +26519,9 @@ define <20 x half> 
@bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v18.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v18.h @@ -26562,9 +26559,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 @@ -26589,11 +26586,10 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2 ; GFX11-TRUE16-NEXT: .LBB62_4: ; %cmp.true -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -26681,16 +26677,16 @@ define <20 x half> 
@bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v35, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0 ; GFX11-FAKE16-NEXT: s_clause 0x9 -; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, v14 :: v_dual_mov_b32 v32, v12 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v49, 8, v3 @@ -26704,20 +26700,20 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v19 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v21 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v69, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 
0, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v10 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -26764,13 +26760,13 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v28 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v30 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v69 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v23 @@ -26799,10 +26795,10 @@ define 
<20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 @@ -26815,7 +26811,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 @@ -26826,14 +26822,12 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB62_2 ; GFX11-FAKE16-NEXT: .LBB62_4: ; %cmp.true -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v68, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v66, 3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v64, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v28, 3 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v68, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 @@ -26866,7 +26860,7 @@ 
define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v25, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v64, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v66, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v18, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v16, 3 @@ -30795,15 +30789,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x9 -; GFX11-TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l @@ -30837,14 +30831,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v36.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v37.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -31092,15 +31086,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0x9 -; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:24 ; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 
offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v1 @@ -31118,17 +31112,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v10 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -35441,15 +35435,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x9 -; GFX11-TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v36, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l @@ -35483,14 +35477,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v36.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v37.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -35738,15 +35732,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0x9 -; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:24 ; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v1 @@ -35764,17 +35758,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v4 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v6 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v10 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 65fde2fd5e190..1e24ed30fd2e4 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -12870,29 +12870,29 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:104 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:96 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 
v80, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:88 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 @@ -12936,40 +12936,40 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v64.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v65.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v67.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v68.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v70.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v70.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v71.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 @@ -13353,29 +13353,29 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:120 ; GFX11-FAKE16-NEXT: 
scratch_load_u16 v48, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:112 ; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:24 ; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:80 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v50, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:76 ; GFX11-FAKE16-NEXT: scratch_load_u16 v54, off, s32 offset:68 @@ -13403,41 +13403,40 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v14 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v65 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v66 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v65 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v99, 8, v67 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v81 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v82 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v83, 8, v83 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v84 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v85, 8, v85 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v128 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v131 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v129 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -15125,16 +15124,16 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 ; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v34, off, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12 @@ -15157,7 +15156,7 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo @@ -15278,17 +15277,17 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v39 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35 
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87 @@ -15485,19 +15484,19 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v39 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15 @@ -15544,18 +15543,18 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, 
s32 offset:56 ; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:48 ; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:56 ; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12 @@ -15576,24 +15575,24 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v8 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v10 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v86 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -28229,29 +28228,29 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 
offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:96 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:56 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v71, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:88 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 @@ -28295,40 +28294,40 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v64.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v65.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v67.h ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(21) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v68.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v70.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v70.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v71.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 @@ -28712,29 +28711,29 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:124 
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:112 ; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:24 ; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:64 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:76 ; GFX11-FAKE16-NEXT: scratch_load_u16 v54, off, s32 offset:68 @@ -28762,41 +28761,40 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v14 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v65 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v14 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v66 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v67 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v81 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v82 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v83, 8, v83 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v84 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v85, 8, v85 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v128 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v131 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v129 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -30484,16 +30482,16 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12 @@ -30516,7 +30514,7 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo @@ -30637,17 +30635,17 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v39 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48 
; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87 @@ -30844,19 +30842,19 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v39 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15 @@ -30903,18 +30901,18 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 ; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:48 ; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:56 ; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12 @@ -30935,24 +30933,24 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 
vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v8 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v10 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v86 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -42859,29 +42857,29 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 
offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:96 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:40 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v70, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:88 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 @@ -42925,40 +42923,40 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v64.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v65.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) 
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v67.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v68.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v70.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v70.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v71.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 @@ -43342,29 +43340,29 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 
0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:112 ; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:24 ; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:48 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:76 ; GFX11-FAKE16-NEXT: scratch_load_u16 v54, off, s32 offset:68 @@ -43392,41 +43390,40 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v14 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 
vcc_lo, 0, v65 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v66 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v67 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v81 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v82 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v83, 8, v83 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v84 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v85, 8, v85 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v128 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v131 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v129 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -45114,16 +45111,16 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56 ; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v33, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12 @@ -45146,7 +45143,7 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo @@ -45267,17 +45264,17 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v39 ; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v15, 8, v36 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87 @@ -45474,19 +45471,19 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v39 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15 @@ -45533,18 +45530,18 @@ define inreg <8 x 
i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 ; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:48 ; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:56 ; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12 @@ -45565,24 +45562,24 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) 
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v8 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v10 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v86 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB71_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -56643,29 +56640,29 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 
offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:96 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:24 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v69, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:88 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 @@ -56709,40 +56706,40 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v64.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v65.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, 
v66.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v67.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v68.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v70.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v70.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v71.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 @@ -57126,29 +57123,29 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:112 ; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:24 ; GFX11-FAKE16-NEXT: scratch_load_u16 
v83, off, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:76 ; GFX11-FAKE16-NEXT: scratch_load_u16 v54, off, s32 offset:68 @@ -57176,41 +57173,40 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v25, 8, v14 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v65 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v66 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v67 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v81 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v82 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v83, 8, v83 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v84 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v85, 8, v85 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v128 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v131 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v129 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -58898,16 +58894,16 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12 @@ -58930,7 +58926,7 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo @@ -59051,17 +59047,17 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 
v14, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v39 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87 @@ -59258,19 +59254,19 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v39 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15 @@ -59317,18 +59313,18 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 ; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:48 ; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:56 ; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12 @@ -59349,24 +59345,24 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 
8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v8 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v10 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v86 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB87_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -70809,41 +70805,41 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 
offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:124 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:108 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:72 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:88 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, 
off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l @@ -70879,18 +70875,20 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) @@ -70898,22 +70896,19 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v51.h ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v52.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v54.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v65 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow @@ -70941,22 +70936,22 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v34.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v8.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v23.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v23.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v20.l @@ -71005,22 +71000,22 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 @@ -71056,15 +71051,15 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v37.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v37.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v55.l, v0.l @@ -71079,17 +71074,17 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> 
%a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v36.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v54.h, v2.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.l @@ -71097,18 +71092,18 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v35.l, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v51.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v32.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -71199,41 +71194,41 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 
offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:20 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v83, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:20 ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:4 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v49, 8, v5 @@ -71244,43 +71239,41 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v15 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v17 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v83, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v23 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v14 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) -; GFX11-FAKE16-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v87 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v97 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v100 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v101 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v102 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v102 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v103 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v103 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v112 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v113 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v115 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v117 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v2 ; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -71317,8 +71310,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v53 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v52 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v83 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v71 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v3, v6, v5, 0x5040100 @@ -71329,19 +71322,19 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v28 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v30 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v82 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v19 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v97 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 
v12, v12, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v87 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v29 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v112 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v103 @@ -71350,16 +71343,16 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v8, v11, v10, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v9, v13, v12, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v10, v15, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v82 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v83 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v99 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v66 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v66 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v101 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v100 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v114 @@ -71391,22 +71384,22 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 @@ -71417,15 +71410,15 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 -; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 @@ -71442,15 +71435,15 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-FAKE16-NEXT: .LBB98_4: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v70, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v66, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v64, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v66, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v98, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v86, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v97, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v96, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v118, v0 @@ -71458,7 +71451,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v99, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v116, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v5 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v3 @@ -71469,46 +71462,46 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v66, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v3 ; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v4, v117, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v85, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v83, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v84, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v82, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v80, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v85, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v84, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v82, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v102, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v101, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v112, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v103, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v70, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v82, 0x300, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v80, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v70, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v68, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v71, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v69, 3 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v64, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v28, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v96, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v87, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v29, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v98, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v97, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v25, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v2 @@ -71530,9 +71523,9 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v19, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v83, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v81, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v81, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v16, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v0 @@ -71586,12 +71579,12 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v7, v19, v7, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v9, v25, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v68, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v70, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v69, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v67, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v67, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v82, v11, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v12, v80, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v68, v13, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v14, v66, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v64, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v65, v15, 0x5040100 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -72896,24 +72889,24 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:28 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 
offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l @@ -72944,7 +72937,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB99_4 @@ -73035,10 +73028,10 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v80 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v68 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13 @@ -73046,22 +73039,22 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 
v3, 0xff, v66 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v83 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v85 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v71 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11 @@ -73121,42 +73114,43 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v70 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v81 ; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 
s6, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v85 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v71 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v69 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v84 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v49 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v83 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v80 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66 ; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 @@ -73172,7 +73166,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v50 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 @@ -73288,21 +73282,21 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:44 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 8, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v3 @@ -73320,24 +73314,24 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v8 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v10 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v85 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB99_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -73392,7 +73386,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v24 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v70 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 @@ -73406,7 +73400,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v68 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -73416,26 +73410,26 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 
0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v28 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v27 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v70 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v66 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v84 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v82 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v85 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff, v14 @@ -73451,8 +73445,8 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB99_3 ; GFX11-FAKE16-NEXT: .LBB99_2: ; %cmp.true ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v68 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v70 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v68 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30 ; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v64 @@ -73462,14 +73456,14 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v16 ; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v71, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v80, v5 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 ; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v66, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v67, v6 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v26 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v28 @@ -73564,17 +73558,17 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v35 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v69 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v80 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v81 ; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v65 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v66 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v69 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v65 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, 
v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v83, v2 @@ -73586,7 +73580,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v85, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v82, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 @@ -83880,41 +83874,41 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:124 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:108 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:72 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v50, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:88 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, 
s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l @@ -83950,18 +83944,20 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) @@ -83969,22 +83965,19 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v51.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v52.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v54.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v65 ; 
GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow @@ -84012,22 +84005,22 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v23.l ; 
GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v23.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v20.l @@ -84076,22 +84069,22 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 @@ -84127,15 +84120,15 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 ; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v37.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v37.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v55.l, v0.l @@ -84150,17 +84143,17 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v36.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v54.h, v2.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.l 
@@ -84168,18 +84161,18 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v35.l, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v51.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v32.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -84270,41 +84263,41 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, 
s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 
v116, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:20 ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:4 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v49, 8, v5 @@ -84315,43 +84308,41 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v15 ; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v53, 8, v17 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v83, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v23 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v14 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v87 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v97 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v100 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v101 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v102 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v102 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v103 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v103 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v112 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v113 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v101, 8, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v115 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v117 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -84388,8 +84379,8 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v53 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v52 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v83 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v71 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v3, v6, v5, 0x5040100 @@ -84400,19 +84391,19 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v28 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v30 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, 
v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v82 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v19 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v97 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v87 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v29 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v112 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v103 @@ -84421,16 +84412,16 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v8, v11, v10, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v9, v13, v12, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v10, v15, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v82 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v83 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v99 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v66 -; GFX11-FAKE16-NEXT: v_and_b32_e32 
v19, 0xff, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v66 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v101 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v100 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v114 @@ -84462,22 +84453,22 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 @@ -84488,15 +84479,15 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 @@ -84513,15 +84504,15 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB106_2 ; GFX11-FAKE16-NEXT: .LBB106_4: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v70, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v66, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v64, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v66, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v98, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v86, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v97, 3 +; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v6, v96, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v118, v0 @@ -84529,7 +84520,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v99, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v116, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v5 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v3 @@ -84540,46 +84531,46 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v66, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v117, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v85, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v83, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v84, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v82, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v80, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v85, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v84, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v82, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v102, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v101, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v112, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v103, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v70, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v82, 0x300, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v80, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v70, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v68, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v71, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v69, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v64, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v28, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v96, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v87, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v29, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v98, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v97, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v25, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v2 @@ -84601,9 +84592,9 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v19, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v83, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v81, v3 ; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v6, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v81, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v16, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v0 @@ -84657,12 +84648,12 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v7, v19, v7, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v9, v25, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v68, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v70, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v69, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v67, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v67, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v82, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v80, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v68, v13, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v14, v66, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v64, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v65, v15, 0x5040100 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -85932,24 +85923,24 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:48 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v81, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:28 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l @@ -85980,7 +85971,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB107_4 @@ -86071,10 +86062,10 @@ define inreg 
<32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v80 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v68 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13 @@ -86082,22 +86073,22 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v83 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v85 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v97, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v71 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11 @@ -86157,42 +86148,43 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v70 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v81 ; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v85 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v71 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v69 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v84 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v5, 8, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v49 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v83 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v80 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 @@ -86208,7 +86200,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v50 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 @@ -86324,21 +86316,21 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, 
s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 8, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v3 @@ -86356,24 +86348,24 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29 ; 
GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v8 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v10 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v85 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB107_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -86428,7 +86420,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v24 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v70 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 @@ -86442,7 +86434,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v68 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -86452,26 +86444,26 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v28 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v27 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v70 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v66 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v84 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v1, v1, v29 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v82 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v85 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff, v14 @@ -86487,8 +86479,8 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB107_3 ; GFX11-FAKE16-NEXT: .LBB107_2: ; %cmp.true ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v68 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v70 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v68 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v64 @@ -86498,14 +86490,14 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v16 ; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v71, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v80, v5 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 ; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v66, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v67, v6 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v26 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v28 @@ -86600,17 +86592,17 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v4, 3, v35 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v69 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v80 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v81 ; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v65 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v66 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v69 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v65 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v83, v2 @@ -86622,7 +86614,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v85, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v82, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 @@ -95249,41 +95241,41 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:124 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:120 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:108 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:24 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v52, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:88 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l ; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v29.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l @@ -95319,18 +95311,20 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) @@ -95338,22 +95332,19 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v51.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v52.l ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v54.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v65 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow @@ -95381,22 +95372,22 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v33.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v23.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v23.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v20.l @@ -95445,22 +95436,22 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 @@ -95496,15 +95487,15 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 ; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v37.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v37.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v55.l, v0.l @@ -95519,17 +95510,17 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 
0x300, v1.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v36.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v54.h, v2.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.l @@ -95537,18 +95528,18 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v35.l, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v51.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3 +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v1.h, v32.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l @@ -95639,41 +95630,41 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_b32 v12, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, 
off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:68 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v80, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:20 ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:4 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v3 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v49, 8, v5 @@ -95684,43 +95675,41 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v15 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v17 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v83, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v81, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v23 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v25 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v14 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v87 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) -; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v97 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v98, 8, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v97, 8, v100 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v101 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v96, 8, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v87, 8, v102 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v102 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v103 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v103 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v112, 8, v112 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v113 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v115 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v117 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) @@ -95757,8 +95746,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v53 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v52 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v83 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v71 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v3, v6, v5, 0x5040100 @@ -95769,19 +95758,19 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v28 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v30 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v82 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v19 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v97 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v87 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v29 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v112 ; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v103 @@ -95790,16 +95779,16 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v8, v11, v10, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v9, v13, v12, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v10, v15, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v82 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v83 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v99 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v66 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v66 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v101 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v100 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v114 @@ -95831,22 +95820,22 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 @@ -95857,15 +95846,15 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 ; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr112 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 @@ -95882,15 +95871,15 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB110_2 ; GFX11-FAKE16-NEXT: .LBB110_4: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v70, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v66, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v64, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v66, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v98, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v86, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v97, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v96, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v118, v0 @@ -95898,7 +95887,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v99, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v116, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v5 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v3 @@ -95909,46 +95898,46 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v66, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v117, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v85, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v0 
+; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v83, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v84, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v82, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v80, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v85, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v84, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v82, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v102, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v101, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v112, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v103, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v70, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v82, 0x300, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v80, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v70, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v68, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v71, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v69, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 
v3, v64, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v28, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v96, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v87, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v29, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v98, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v97, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v25, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v2 @@ -95970,9 +95959,9 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v19, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v83, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v81, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v81, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v16, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v0 @@ -96026,12 +96015,12 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v7, v19, v7, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v9, v25, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v68, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v70, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v69, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v67, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v67, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v82, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v80, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v68, v13, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 
v14, v66, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v64, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v65, v15, 0x5040100 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -97305,24 +97294,24 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:56 ; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:28 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, 
s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l @@ -97353,7 +97342,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB111_4 @@ -97444,10 +97433,10 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v80 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v68 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13 @@ -97455,22 +97444,22 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v83 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v85 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v71 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11 @@ -97530,42 +97519,43 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v70 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v81 ; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 -; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v4, 3, v70 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v85 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v71 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v69 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v84 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v49 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v83 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v80 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 @@ -97581,7 +97571,7 @@ define 
inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v50 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 @@ -97697,21 +97687,21 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2 ; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:36 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 8, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v3 @@ -97729,24 +97719,24 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v8 ; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(7) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v10 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v85 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB111_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false @@ -97801,7 +97791,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v24 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v70 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 @@ -97815,7 +97805,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v68 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v23 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -97825,26 +97815,26 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v81 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v28 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v27 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v70 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v66 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v84 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v25 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v82 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v85 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff, v14 @@ -97860,8 +97850,8 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB111_3 ; GFX11-FAKE16-NEXT: .LBB111_2: ; %cmp.true ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v68 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v70 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v68 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v64 @@ -97871,14 +97861,14 @@ define inreg <32 x bfloat> 
@bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v16 ; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v71, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v80, v5 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff ; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 ; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v66, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v67, v6 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v26 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v28 @@ -97973,17 +97963,17 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v35 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v69 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v80 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v81 ; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v65 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v66 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v69 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v65 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v83, v2 @@ -97995,7 
+97985,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v85, v1 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v82, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll index 8ca3e8255b634..97b9b0b8d2786 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll @@ -103,40 +103,42 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24 ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX11-NEXT: s_mov_b32 s32, 0 ; GISEL-GFX11-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9 +; GISEL-GFX11-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11 +; GISEL-GFX11-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13 +; GISEL-GFX11-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15 ; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 4 +; GISEL-GFX11-NEXT: s_add_u32 s25, s32, 8 +; GISEL-GFX11-NEXT: s_add_u32 s26, s32, 12 +; GISEL-GFX11-NEXT: s_add_u32 s27, s32, 16 +; GISEL-GFX11-NEXT: s_add_u32 s28, s32, 20 +; GISEL-GFX11-NEXT: s_add_u32 s29, s32, 24 +; GISEL-GFX11-NEXT: s_add_u32 s30, s32, 28 +; GISEL-GFX11-NEXT: s_clause 0x7 ; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; GISEL-GFX11-NEXT: scratch_store_b32 off, v17, s24 -; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 8 -; GISEL-GFX11-NEXT: s_add_u32 s25, s32, 12 -; GISEL-GFX11-NEXT: scratch_store_b32 off, v18, s24 -; GISEL-GFX11-NEXT: scratch_store_b32 off, v19, s25 -; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 16 -; GISEL-GFX11-NEXT: s_add_u32 s25, s32, 20 -; GISEL-GFX11-NEXT: scratch_store_b32 off, v20, s24 
-; GISEL-GFX11-NEXT: scratch_store_b32 off, v21, s25 -; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 24 -; GISEL-GFX11-NEXT: s_add_u32 s25, s32, 28 -; GISEL-GFX11-NEXT: scratch_store_b32 off, v22, s24 -; GISEL-GFX11-NEXT: scratch_store_b32 off, v23, s25 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v18, s25 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v19, s26 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v20, s27 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v21, s28 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v22, s29 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v23, s30 ; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 32 ; GISEL-GFX11-NEXT: s_add_u32 s25, s32, 36 +; GISEL-GFX11-NEXT: s_add_u32 s26, s32, 40 +; GISEL-GFX11-NEXT: s_add_u32 s27, s32, 44 +; GISEL-GFX11-NEXT: s_add_u32 s28, s32, 48 +; GISEL-GFX11-NEXT: s_add_u32 s29, s32, 52 +; GISEL-GFX11-NEXT: s_add_u32 s30, s32, 56 +; GISEL-GFX11-NEXT: s_add_u32 s31, s32, 60 +; GISEL-GFX11-NEXT: s_clause 0x7 ; GISEL-GFX11-NEXT: scratch_store_b32 off, v24, s24 ; GISEL-GFX11-NEXT: scratch_store_b32 off, v25, s25 -; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 40 -; GISEL-GFX11-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11 -; GISEL-GFX11-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13 -; GISEL-GFX11-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15 -; GISEL-GFX11-NEXT: s_add_u32 s25, s32, 44 -; GISEL-GFX11-NEXT: scratch_store_b32 off, v26, s24 -; GISEL-GFX11-NEXT: scratch_store_b32 off, v27, s25 -; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 48 -; GISEL-GFX11-NEXT: s_add_u32 s25, s32, 52 -; GISEL-GFX11-NEXT: scratch_store_b32 off, v28, s24 -; GISEL-GFX11-NEXT: scratch_store_b32 off, v29, s25 -; GISEL-GFX11-NEXT: s_add_u32 s24, s32, 56 -; GISEL-GFX11-NEXT: s_add_u32 s25, s32, 60 -; GISEL-GFX11-NEXT: scratch_store_b32 off, v30, s24 -; GISEL-GFX11-NEXT: scratch_store_b32 off, v31, s25 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v26, s26 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v27, s27 +; GISEL-GFX11-NEXT: scratch_store_b32 
off, v28, s28 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v29, s29 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v30, s30 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v31, s31 ; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GISEL-GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GISEL-GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 @@ -231,40 +233,42 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24 ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX11-NEXT: s_mov_b32 s32, 0 ; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v32, v15 :: v_dual_mov_b32 v33, v14 +; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v34, v13 :: v_dual_mov_b32 v35, v12 +; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v36, v11 :: v_dual_mov_b32 v37, v10 +; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v38, v9 :: v_dual_mov_b32 v39, v8 ; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 60 +; DAGISEL-GFX11-NEXT: s_add_i32 s25, s32, 56 +; DAGISEL-GFX11-NEXT: s_add_i32 s26, s32, 52 +; DAGISEL-GFX11-NEXT: s_add_i32 s27, s32, 48 +; DAGISEL-GFX11-NEXT: s_add_i32 s28, s32, 44 +; DAGISEL-GFX11-NEXT: s_add_i32 s29, s32, 40 +; DAGISEL-GFX11-NEXT: s_add_i32 s30, s32, 36 +; DAGISEL-GFX11-NEXT: s_clause 0x7 ; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v31, s24 -; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 56 -; DAGISEL-GFX11-NEXT: s_add_i32 s25, s32, 52 -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v30, s24 -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v29, s25 -; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 48 -; DAGISEL-GFX11-NEXT: s_add_i32 s25, s32, 44 -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v28, s24 -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v27, s25 -; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 40 -; DAGISEL-GFX11-NEXT: s_add_i32 s25, s32, 36 -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v26, s24 -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v25, s25 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v30, s25 +; 
DAGISEL-GFX11-NEXT: scratch_store_b32 off, v29, s26 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v28, s27 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v27, s28 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v26, s29 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v25, s30 ; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 32 ; DAGISEL-GFX11-NEXT: s_add_i32 s25, s32, 28 +; DAGISEL-GFX11-NEXT: s_add_i32 s26, s32, 24 +; DAGISEL-GFX11-NEXT: s_add_i32 s27, s32, 20 +; DAGISEL-GFX11-NEXT: s_add_i32 s28, s32, 16 +; DAGISEL-GFX11-NEXT: s_add_i32 s29, s32, 12 +; DAGISEL-GFX11-NEXT: s_add_i32 s30, s32, 8 +; DAGISEL-GFX11-NEXT: s_add_i32 s31, s32, 4 +; DAGISEL-GFX11-NEXT: s_clause 0x7 ; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v24, s24 ; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v23, s25 -; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 24 -; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v34, v13 :: v_dual_mov_b32 v35, v12 -; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v36, v11 :: v_dual_mov_b32 v37, v10 -; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v38, v9 :: v_dual_mov_b32 v39, v8 -; DAGISEL-GFX11-NEXT: s_add_i32 s25, s32, 20 -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v22, s24 -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v21, s25 -; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 16 -; DAGISEL-GFX11-NEXT: s_add_i32 s25, s32, 12 -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v20, s24 -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v19, s25 -; DAGISEL-GFX11-NEXT: s_add_i32 s24, s32, 8 -; DAGISEL-GFX11-NEXT: s_add_i32 s25, s32, 4 -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v18, s24 -; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v17, s25 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v22, s26 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v21, s27 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v20, s28 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v19, s29 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v18, s30 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v17, s31 ; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 
s1 ; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll index e1bbc243344b0..ec2d3f5b1fd6b 100644 --- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll @@ -15,9 +15,8 @@ declare void @llvm.amdgcn.s.barrier() #2 ; SI-ALLOCA: buffer_load_dword [[LOAD_A:v[0-9]+]] ; SI-ALLOCA: buffer_load_dword [[LOAD_B:v[0-9]+]] -; SI-ALLOCA: v_lshlrev_b32_e32 [[SIZE_SCALE:v[0-9]+]], 2, [[LOAD_A]] +; SI-ALLOCA: v_lshlrev_b32_e32 [[PTRREG:v[0-9]+]], 2, [[LOAD_B]] -; SI-ALLOCA: v_mov_b32_e32 [[PTRREG:v[0-9]+]], [[SIZE_SCALE]] ; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64 ; SI-ALLOCA: s_barrier ; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64 diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll index 51caa84450ff3..f546fb46acc00 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll @@ -371,21 +371,22 @@ define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s15 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: s_add_u32 s16, s8, 8 +; GFX8-NEXT: s_addc_u32 s17, s9, 0 +; GFX8-NEXT: s_getpc_b64 s[18:19] +; GFX8-NEXT: s_add_u32 s18, s18, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s19, s19, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dword s15, s[8:9], 0x0 -; GFX8-NEXT: s_add_u32 s8, s8, 8 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 -; GFX8-NEXT: s_getpc_b64 s[16:17] -; GFX8-NEXT: s_add_u32 s16, s16, 
with_private_to_flat_addrspacecast@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX8-NEXT: s_mov_b64 s[8:9], s[16:17] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s15 ; GFX8-NEXT: s_mov_b32 s32, 0 -; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX8-NEXT: s_endpgm ; ; GFX8-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel: @@ -394,16 +395,16 @@ define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr ; GFX8-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8 ; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s13, s9 ; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0 +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX8-ARCH-FLAT-NEXT: s_getpc_b64 s[6:7] +; GFX8-ARCH-FLAT-NEXT: s_add_u32 s6, s6, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s7, s7, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 ; GFX8-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 -; GFX8-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5] -; GFX8-ARCH-FLAT-NEXT: s_add_u32 s4, s4, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 -; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 -; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 -; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v31, v0, v2 ; 
GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3] @@ -417,20 +418,21 @@ define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s15 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_add_u32 s16, s8, 8 +; GFX9-NEXT: s_addc_u32 s17, s9, 0 +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dword s15, s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 s8, s8, 8 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[8:9], s[16:17] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s15 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: s_endpgm ; ; GFX9-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel: @@ -439,15 +441,15 @@ define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr ; GFX9-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8 ; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s13, s9 ; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-ARCH-FLAT-NEXT: s_getpc_b64 s[6:7] +; GFX9-ARCH-FLAT-NEXT: s_add_u32 s6, s6, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s7, s7, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 ; 
GFX9-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 -; GFX9-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5] -; GFX9-ARCH-FLAT-NEXT: s_add_u32 s4, s4, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 -; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 -; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 -; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-ARCH-FLAT-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3] @@ -463,13 +465,13 @@ define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr ; GFX942-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8 ; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s13, s9 ; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0 -; GFX942-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 -; GFX942-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5] -; GFX942-ARCH-FLAT-NEXT: s_add_u32 s4, s4, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 -; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 -; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 ; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX942-ARCH-FLAT-NEXT: s_getpc_b64 s[6:7] +; GFX942-ARCH-FLAT-NEXT: s_add_u32 s6, s6, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s7, s7, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX942-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 +; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v31, v0 @@ -483,20 +485,22 @@ define amdgpu_kernel void 
@call_with_private_to_flat_addrspacecast_cc_kernel(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s0, s0, s15 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_load_dword s15, s[8:9], 0x0 -; GFX10-NEXT: s_add_u32 s8, s8, 8 -; GFX10-NEXT: s_addc_u32 s9, s9, 0 -; GFX10-NEXT: s_getpc_b64 s[16:17] -; GFX10-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX10-NEXT: s_add_u32 s16, s8, 8 +; GFX10-NEXT: s_addc_u32 s17, s9, 0 +; GFX10-NEXT: s_getpc_b64 s[18:19] +; GFX10-NEXT: s_add_u32 s18, s18, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s19, s19, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s15, s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX10-NEXT: s_mov_b64 s[8:9], s[16:17] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s15 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX10-NEXT: s_endpgm call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) ret void @@ -736,21 +740,22 @@ define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s15 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: s_add_u32 s16, s8, 8 +; GFX8-NEXT: s_addc_u32 s17, s9, 0 +; GFX8-NEXT: s_getpc_b64 s[18:19] +; GFX8-NEXT: s_add_u32 s18, s18, calls_intrin_ascast@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s19, s19, calls_intrin_ascast@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dword s15, s[8:9], 0x0 -; GFX8-NEXT: s_add_u32 s8, s8, 8 -; GFX8-NEXT: s_addc_u32 s9, s9, 0 -; GFX8-NEXT: s_getpc_b64 s[16:17] -; GFX8-NEXT: 
s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX8-NEXT: s_mov_b64 s[8:9], s[16:17] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s15 ; GFX8-NEXT: s_mov_b32 s32, 0 -; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX8-NEXT: s_endpgm ; ; GFX8-ARCH-FLAT-LABEL: call_calls_intrin_ascast_cc_kernel: @@ -759,16 +764,16 @@ define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) % ; GFX8-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8 ; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s13, s9 ; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0 +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX8-ARCH-FLAT-NEXT: s_getpc_b64 s[6:7] +; GFX8-ARCH-FLAT-NEXT: s_add_u32 s6, s6, calls_intrin_ascast@gotpcrel32@lo+4 +; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s7, s7, calls_intrin_ascast@gotpcrel32@hi+12 ; GFX8-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 -; GFX8-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5] -; GFX8-ARCH-FLAT-NEXT: s_add_u32 s4, s4, calls_intrin_ascast@gotpcrel32@lo+4 -; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, calls_intrin_ascast@gotpcrel32@hi+12 -; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 -; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3] 
@@ -782,20 +787,21 @@ define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s15 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_add_u32 s16, s8, 8 +; GFX9-NEXT: s_addc_u32 s17, s9, 0 +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, calls_intrin_ascast@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, calls_intrin_ascast@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dword s15, s[8:9], 0x0 -; GFX9-NEXT: s_add_u32 s8, s8, 8 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[8:9], s[16:17] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s15 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: s_endpgm ; ; GFX9-ARCH-FLAT-LABEL: call_calls_intrin_ascast_cc_kernel: @@ -804,15 +810,15 @@ define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) % ; GFX9-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8 ; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s13, s9 ; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-ARCH-FLAT-NEXT: s_getpc_b64 s[6:7] +; GFX9-ARCH-FLAT-NEXT: s_add_u32 s6, s6, calls_intrin_ascast@gotpcrel32@lo+4 +; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s7, s7, calls_intrin_ascast@gotpcrel32@hi+12 ; GFX9-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 -; GFX9-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5] -; GFX9-ARCH-FLAT-NEXT: s_add_u32 s4, s4, calls_intrin_ascast@gotpcrel32@lo+4 -; GFX9-ARCH-FLAT-NEXT: 
s_addc_u32 s5, s5, calls_intrin_ascast@gotpcrel32@hi+12 -; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 -; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-ARCH-FLAT-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3] @@ -828,13 +834,13 @@ define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) % ; GFX942-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8 ; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s13, s9 ; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0 -; GFX942-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 -; GFX942-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5] -; GFX942-ARCH-FLAT-NEXT: s_add_u32 s4, s4, calls_intrin_ascast@gotpcrel32@lo+4 -; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, calls_intrin_ascast@gotpcrel32@hi+12 -; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 ; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX942-ARCH-FLAT-NEXT: s_getpc_b64 s[6:7] +; GFX942-ARCH-FLAT-NEXT: s_add_u32 s6, s6, calls_intrin_ascast@gotpcrel32@lo+4 +; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s7, s7, calls_intrin_ascast@gotpcrel32@hi+12 +; GFX942-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 +; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v31, v0 @@ -848,20 +854,22 @@ define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s0, s0, s15 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_load_dword s15, s[8:9], 0x0 -; GFX10-NEXT: s_add_u32 s8, s8, 8 -; GFX10-NEXT: s_addc_u32 s9, s9, 0 -; GFX10-NEXT: s_getpc_b64 s[16:17] -; 
GFX10-NEXT: s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12 +; GFX10-NEXT: s_add_u32 s16, s8, 8 +; GFX10-NEXT: s_addc_u32 s17, s9, 0 +; GFX10-NEXT: s_getpc_b64 s[18:19] +; GFX10-NEXT: s_add_u32 s18, s18, calls_intrin_ascast@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s19, s19, calls_intrin_ascast@gotpcrel32@hi+12 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s15, s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX10-NEXT: s_mov_b64 s[8:9], s[16:17] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s15 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX10-NEXT: s_endpgm call void @calls_intrin_ascast(ptr addrspace(3) %ptr) ret void diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index c4957fd44e2be..338ea4a133e48 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -2140,6 +2140,7 @@ define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) { ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, 0x3f80 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.h, 0x4228 +; GFX11TRUE16-NEXT: s_clause 0x1 ; GFX11TRUE16-NEXT: global_store_b16 v[0:1], v4, off ; GFX11TRUE16-NEXT: global_store_d16_hi_b16 v[2:3], v4, off ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -2149,6 +2150,7 @@ define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) { ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: v_mov_b32_e32 v4, 0x3f80 ; GFX11FAKE16-NEXT: v_mov_b32_e32 v5, 0x4228 +; GFX11FAKE16-NEXT: s_clause 0x1 ; GFX11FAKE16-NEXT: global_store_b16 
v[0:1], v4, off ; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v5, off ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -43111,32 +43113,32 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11TRUE16-NEXT: s_clause 0x1f ; GFX11TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 -; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:68 -; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:72 -; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:124 -; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:128 -; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:64 -; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:60 +; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:124 +; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:128 +; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:60 +; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:64 +; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:112 +; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:116 ; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:120 ; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:56 -; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:116 +; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:48 ; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:52 -; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:112 -; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:48 +; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:100 +; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:104 ; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:108 ; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:44 -; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:104 +; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:36 ; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:40 -; 
GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:100 -; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:36 -; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:96 -; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:32 +; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:96 +; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:32 +; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:84 +; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:88 ; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:92 ; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:28 -; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:88 +; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:20 ; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:24 -; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:84 -; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:20 +; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:68 +; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:72 ; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:76 ; GFX11TRUE16-NEXT: scratch_load_b32 v83, off, s32 offset:80 ; GFX11TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:16 @@ -43207,55 +43209,52 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v26 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32) ; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v31 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v35.l, v36.l, s26 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26) -; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v34.l, v37.l, s27 -; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v34.h, v37.h, s28 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(29) +; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v32.l, v34.l, s27 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28) +; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v35.l, s26 +; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v32.h, v34.h, s28 ; 
GFX11TRUE16-NEXT: s_waitcnt vmcnt(24) ; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v38.l, v39.l, s29 ; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v38.h, v39.h, s25 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22) -; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v48.l, v49.l, s24 -; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v48.h, v49.h, s23 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v50.l, v51.l, s22 -; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v50.h, v51.h, s21 +; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v37.l, v49.l, s24 +; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v37.h, v49.h, s23 +; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v36.l, v48.l, s22 +; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v36.h, v48.h, s21 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18) ; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v52.l, v53.l, s20 ; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v52.h, v53.h, s19 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v54.l, v55.l, s18 -; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v54.h, v55.h, s17 +; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v51.l, v55.l, s18 +; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v51.h, v55.h, s17 +; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v50.l, v54.l, s16 +; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v50.h, v54.h, s15 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v64.l, v65.l, s16 -; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v64.h, v65.h, s15 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v66.l, v67.l, s14 -; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v66.h, v67.h, s13 +; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v64.l, v65.l, s14 +; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v64.h, v65.h, s13 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(10) ; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v68.l, v69.l, s12 ; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v68.h, v69.h, s11 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v70.l, v71.l, s10 -; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, 
v70.h, v71.h, s9 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v80.l, v81.l, s8 -; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v80.h, v81.h, s7 +; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v67.l, v71.l, s10 +; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v67.h, v71.h, s9 +; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v66.l, v70.l, s8 +; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v66.h, v70.h, s7 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(3) ; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v83.l, v84.l, s6 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v82.l, v85.l, s4 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v33.l, v86.l, s2 +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v81.l, v86.l, s2 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v32.l, v87.l, s0 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v80.l, v87.l, s0 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v16 -; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v32.h, v87.h, vcc_lo -; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v33.h, v86.h, s1 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v80.h, v87.h, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v81.h, v86.h, s1 ; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v82.h, v85.h, s3 ; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v83.h, v84.h, s5 -; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v35.h, v36.h, s0 +; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v33.h, v35.h, s0 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_vselect_v32bf16: diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index d51e47bfb8d4f..297ce4e2543ea 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -560,8 +560,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, 
$vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr17, 16, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_CSELECT_B64 -1, 0, implicit killed $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_XOR_B64 renamable $sgpr62_sgpr63, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr50_sgpr51, implicit-def dead $scc @@ -654,7 +654,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.48.bb63: ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, 
$sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 + ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc @@ -668,7 +668,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.50.bb68: ; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, 
$vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec ; GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec @@ -696,7 +696,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr 
addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.52.bb80: ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, 
$sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc @@ -710,7 +710,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF @@ -724,7 +724,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.54.bb73: ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, 
$sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr6 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1) ; GFX90A-NEXT: renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec @@ -756,22 +756,22 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.56.bb90: ; GFX90A-NEXT: successors: %bb.60(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, 
$vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr11 = COPY renamable $sgpr22, implicit $exec + ; GFX90A-NEXT: renamable $vgpr12 = COPY renamable $sgpr21, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr12, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec - ; 
GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec - ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr11, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr62_sgpr63, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr46, implicit $exec ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr47, killed $vgpr10, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.60 ; GFX90A-NEXT: {{ $}} @@ -815,13 +815,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, 
$vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr1 = COPY renamable $sgpr23, implicit $exec + ; GFX90A-NEXT: renamable $vgpr4 = COPY renamable $sgpr21, implicit $exec + ; GFX90A-NEXT: renamable $vgpr5 = COPY killed renamable $sgpr33, implicit $exec ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec - ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.419, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec - ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr33, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.420, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr1, 0, 0, implicit $exec :: (load (s64) from %ir.419, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr4, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.420, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 @@ -829,14 +829,14 @@ define amdgpu_kernel void @f1(ptr 
addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.59.bb85: ; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, 
$vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr8 = V_OR_B32_e32 1, $vgpr6, implicit $exec ; GFX90A-NEXT: renamable $vgpr9 = COPY renamable $vgpr7, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = FLAT_LOAD_UBYTE renamable $vgpr8_vgpr9, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86) ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr10, implicit $exec - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF @@ -850,20 +850,20 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.60.Flow31: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, 
$vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr54_sgpr55, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.61.Flow30: ; GFX90A-NEXT: successors: %bb.55(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, 
$sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, 
$vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr52_sgpr53, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.55 ; GFX90A-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 178b138b57141..b332c411cc715 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -5577,12 +5577,13 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s22, s32, 8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s21 :: v_dual_mov_b32 v1, s20 +; GFX11-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21 ; GFX11-NEXT: v_mov_b32_e32 v2, s19 ; GFX11-NEXT: s_add_i32 s19, s32, 4 ; GFX11-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v7, s43 -; GFX11-NEXT: scratch_store_b32 off, v0, s22 -; GFX11-NEXT: scratch_store_b32 off, v1, s19 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b32 off, v0, s19 +; GFX11-NEXT: scratch_store_b32 off, v1, s22 ; GFX11-NEXT: scratch_store_b32 off, v2, s32 ; GFX11-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v3, 
s39 ; GFX11-NEXT: v_dual_mov_b32 v1, s37 :: v_dual_mov_b32 v2, s38 @@ -6062,6 +6063,7 @@ define void @stack_12xv3i32() #0 { ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 16 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: scratch_store_b32 off, v4, s0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0 @@ -6403,6 +6405,7 @@ define void @stack_12xv3f32() #0 { ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 16 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: scratch_store_b32 off, v4, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 @@ -6773,6 +6776,7 @@ define void @stack_8xv5i32() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b32 off, v8, s0 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s1 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, 0 @@ -7147,6 +7151,7 @@ define void @stack_8xv5f32() #0 { ; GFX11-NEXT: s_add_i32 s0, s32, 32 ; GFX11-NEXT: s_add_i32 s1, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: scratch_store_b32 off, v8, s0 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s1 diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index 9f48c8b5fe49c..c5b34bd805318 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -109,6 +109,7 @@ define <2 x half> @chain_hi_to_lo_private_different_bases(ptr addrspace(5) %base ; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private_different_bases: ; FLATSCR_GFX10: ; %bb.0: ; %bb ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR_GFX10-NEXT: 
s_clause 0x1 ; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, v0, off ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, v1, off ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) @@ -125,6 +126,7 @@ define <2 x half> @chain_hi_to_lo_private_different_bases(ptr addrspace(5) %base ; GFX11-FAKE16-LABEL: chain_hi_to_lo_private_different_bases: ; GFX11-FAKE16: ; %bb.0: ; %bb ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: scratch_load_u16 v0, v0, off ; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v0, v1, off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -372,6 +374,7 @@ define <2 x half> @chain_hi_to_lo_global_different_bases(ptr addrspace(1) %base_ ; GFX10-LABEL: chain_hi_to_lo_global_different_bases: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: global_load_short_d16_hi v0, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -388,6 +391,7 @@ define <2 x half> @chain_hi_to_lo_global_different_bases(ptr addrspace(1) %base_ ; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_different_bases: ; GFX11-FAKE16: ; %bb.0: ; %bb ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off ; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll index f7c58ca9599b4..1984c0205633c 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -139,6 +139,7 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX11-NEXT: v_max_f32_e64 v2, v1, v1 clamp +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-NEXT: 
global_store_b32 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index 3e0837b58aafc..702d26e4c14ad 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -506,6 +506,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; GFX11-NEXT: v_max_f32_e32 v1, 0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_f32_e32 v2, 1.0, v1 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll index 986dd8a046424..eb762c9ad22de 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -331,56 +331,6 @@ entry: ret void } -; Don't cluster loads from different textures -; DBG-LABEL: no_cluster_image_load: -; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: LocationSize::precise(16) -; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: LocationSize::precise(16) -; DBG-NOT: {{^}}Cluster ld/st -define amdgpu_ps void @no_cluster_image_load(<8 x i32> inreg %src1, <8 x i32> inreg %src2, <8 x i32> inreg %dst, i32 %x, i32 %y) { -; GFX9-LABEL: no_cluster_image_load: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: image_load_mip v[3:6], v[0:2], s[0:7] dmask:0xf unorm -; GFX9-NEXT: image_load_mip v[7:10], v[0:2], s[8:15] dmask:0xf unorm -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v6, v6, v10 -; GFX9-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX9-NEXT: image_store v[3:6], v[0:1], s[16:23] dmask:0xf unorm -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: no_cluster_image_load: -; GFX10: ; 
%bb.0: ; %entry -; GFX10-NEXT: v_mov_b32_e32 v10, 0 -; GFX10-NEXT: image_load_mip v[2:5], [v0, v1, v10], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX10-NEXT: image_load_mip v[6:9], [v0, v1, v10], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-NEXT: image_store v[2:5], v[0:1], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: no_cluster_image_load: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: image_load_mip v[2:5], [v0, v1, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: image_load_mip v[6:9], [v0, v1, v6], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 -; GFX11-NEXT: v_dual_add_f32 v3, v3, v7 :: v_dual_add_f32 v2, v2, v6 -; GFX11-NEXT: image_store v[2:5], v[0:1], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX11-NEXT: s_endpgm -entry: - %val1 = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %x, i32 %y, i32 0, <8 x i32> %src1, i32 0, i32 0) - %val2 = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %x, i32 %y, i32 0, <8 x i32> %src2, i32 0, i32 0) - %val = fadd fast <4 x float> %val1, %val2 - call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %val, i32 15, i32 %x, i32 %y, <8 x i32> %dst, i32 0, i32 0) - ret void -} - ; Cluster loads from the same texture and sampler with different coordinates ; DBG-LABEL: cluster_image_sample: ; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: LocationSize::precise(16) diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll index 52ccfe8ba3bfb..497380ff0ae6e 100644 --- 
a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -1,17 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SICIVI,SICI,SI %s ; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICIVI,SICI %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,SICIVI,VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; GCN-LABEL: {{^}}load_i32: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 -; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2 -; GFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 -; GFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 define amdgpu_vs float @load_i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 { +; SICI-LABEL: load_i32: +; SICI: ; %bb.0: +; SICI-NEXT: s_mov_b32 s3, 0 +; SICI-NEXT: s_mov_b32 s2, s1 +; SICI-NEXT: s_mov_b32 s1, s3 +; SICI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SICI-NEXT: s_load_dword s1, s[2:3], 0x2 +; SICI-NEXT: s_waitcnt lgkmcnt(0) +; SICI-NEXT: s_add_i32 s0, s0, s1 +; SICI-NEXT: v_mov_b32_e32 v0, s0 +; SICI-NEXT: ; return to shader part epilog +; +; VI-LABEL: load_i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s3, 0 +; VI-NEXT: s_mov_b32 s2, s1 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_load_dword s0, s[0:1], 0x0 +; VI-NEXT: s_load_dword s1, s[2:3], 0x8 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX9-NEXT: s_add_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: ; return to shader part epilog %gep1 = getelementptr inbounds i32, ptr addrspace(6) %p1, i32 2 %r0 = load i32, ptr addrspace(6) %p0 %r1 = load i32, ptr addrspace(6) %gep1 @@ -20,20 +48,48 @@ define amdgpu_vs float @load_i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) in ret float %r2 } -; GCN-LABEL: {{^}}load_v2i32: -; SICIVI-DAG: s_mov_b32 s3, 0 -; SICIVI-DAG: s_mov_b32 s2, s1 -; SICIVI-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4 -; VI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; VI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 -; GFX9-DAG: s_mov_b32 s2, s1 -; GFX9-DAG: s_mov_b32 s3, 0 -; GFX9-DAG: s_mov_b32 s1, s3 -; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 define amdgpu_vs <2 x float> @load_v2i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 { +; SICI-LABEL: load_v2i32: +; SICI: ; %bb.0: +; SICI-NEXT: s_mov_b32 s3, 0 +; SICI-NEXT: s_mov_b32 s2, s1 +; SICI-NEXT: s_mov_b32 s1, s3 +; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SICI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4 +; SICI-NEXT: s_waitcnt lgkmcnt(0) +; SICI-NEXT: s_add_i32 s0, s0, s2 +; SICI-NEXT: s_add_i32 s1, s1, s3 +; SICI-NEXT: v_mov_b32_e32 v0, s0 +; SICI-NEXT: v_mov_b32_e32 v1, s1 +; SICI-NEXT: ; return to shader part epilog +; +; VI-LABEL: load_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s3, 0 +; VI-NEXT: s_mov_b32 s2, s1 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x10 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s0, s0, s2 +; VI-NEXT: s_add_i32 s1, s1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_mov_b32 s2, s1 +; 
GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s0, s4, s6 +; GFX9-NEXT: s_add_i32 s1, s5, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: ; return to shader part epilog %gep1 = getelementptr inbounds <2 x i32>, ptr addrspace(6) %p1, i32 2 %r0 = load <2 x i32>, ptr addrspace(6) %p0 %r1 = load <2 x i32>, ptr addrspace(6) %gep1 @@ -42,17 +98,60 @@ define amdgpu_vs <2 x float> @load_v2i32(ptr addrspace(6) inreg %p0, ptr addrspa ret <2 x float> %r2 } -; GCN-LABEL: {{^}}load_v4i32: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 -; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8 -; VI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 -; VI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 -; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 -; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 define amdgpu_vs <4 x float> @load_v4i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 { +; SICI-LABEL: load_v4i32: +; SICI: ; %bb.0: +; SICI-NEXT: s_mov_b32 s5, 0 +; SICI-NEXT: s_mov_b32 s4, s1 +; SICI-NEXT: s_mov_b32 s1, s5 +; SICI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SICI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x8 +; SICI-NEXT: s_waitcnt lgkmcnt(0) +; SICI-NEXT: s_add_i32 s0, s0, s4 +; SICI-NEXT: s_add_i32 s1, s1, s5 +; SICI-NEXT: s_add_i32 s2, s2, s6 +; SICI-NEXT: s_add_i32 s3, s3, s7 +; SICI-NEXT: v_mov_b32_e32 v0, s0 +; SICI-NEXT: v_mov_b32_e32 v1, s1 +; SICI-NEXT: v_mov_b32_e32 v2, s2 +; SICI-NEXT: v_mov_b32_e32 v3, s3 +; SICI-NEXT: ; return to shader part epilog +; +; VI-LABEL: load_v4i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s5, 0 +; VI-NEXT: s_mov_b32 s4, s1 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x20 +; VI-NEXT: s_waitcnt 
lgkmcnt(0) +; VI-NEXT: s_add_i32 s0, s0, s4 +; VI-NEXT: s_add_i32 s1, s1, s5 +; VI-NEXT: s_add_i32 s2, s2, s6 +; VI-NEXT: s_add_i32 s3, s3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x20 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s0, s4, s8 +; GFX9-NEXT: s_add_i32 s1, s5, s9 +; GFX9-NEXT: s_add_i32 s2, s6, s10 +; GFX9-NEXT: s_add_i32 s3, s7, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: ; return to shader part epilog %gep1 = getelementptr inbounds <4 x i32>, ptr addrspace(6) %p1, i32 2 %r0 = load <4 x i32>, ptr addrspace(6) %p0 %r1 = load <4 x i32>, ptr addrspace(6) %gep1 @@ -61,17 +160,84 @@ define amdgpu_vs <4 x float> @load_v4i32(ptr addrspace(6) inreg %p0, ptr addrspa ret <4 x float> %r2 } -; GCN-LABEL: {{^}}load_v8i32: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 -; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10 -; VI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 -; VI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 -; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 -; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 define amdgpu_vs <8 x float> @load_v8i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 { +; SICI-LABEL: load_v8i32: +; SICI: ; %bb.0: +; SICI-NEXT: s_mov_b32 s2, s1 +; SICI-NEXT: s_mov_b32 s3, 0 +; SICI-NEXT: s_mov_b32 s1, s3 +; SICI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x10 +; SICI-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x0 +; SICI-NEXT: s_waitcnt lgkmcnt(0) 
+; SICI-NEXT: s_add_i32 s0, s12, s4 +; SICI-NEXT: s_add_i32 s1, s13, s5 +; SICI-NEXT: s_add_i32 s2, s14, s6 +; SICI-NEXT: s_add_i32 s3, s15, s7 +; SICI-NEXT: s_add_i32 s4, s16, s8 +; SICI-NEXT: s_add_i32 s5, s17, s9 +; SICI-NEXT: s_add_i32 s6, s18, s10 +; SICI-NEXT: s_add_i32 s7, s19, s11 +; SICI-NEXT: v_mov_b32_e32 v0, s0 +; SICI-NEXT: v_mov_b32_e32 v1, s1 +; SICI-NEXT: v_mov_b32_e32 v2, s2 +; SICI-NEXT: v_mov_b32_e32 v3, s3 +; SICI-NEXT: v_mov_b32_e32 v4, s4 +; SICI-NEXT: v_mov_b32_e32 v5, s5 +; SICI-NEXT: v_mov_b32_e32 v6, s6 +; SICI-NEXT: v_mov_b32_e32 v7, s7 +; SICI-NEXT: ; return to shader part epilog +; +; VI-LABEL: load_v8i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s2, s1 +; VI-NEXT: s_mov_b32 s3, 0 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x40 +; VI-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s0, s12, s4 +; VI-NEXT: s_add_i32 s1, s13, s5 +; VI-NEXT: s_add_i32 s2, s14, s6 +; VI-NEXT: s_add_i32 s3, s15, s7 +; VI-NEXT: s_add_i32 s4, s16, s8 +; VI-NEXT: s_add_i32 s5, s17, s9 +; VI-NEXT: s_add_i32 s6, s18, s10 +; VI-NEXT: s_add_i32 s7, s19, s11 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v7, s7 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_v8i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x40 +; GFX9-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s0, s12, s4 +; GFX9-NEXT: s_add_i32 s1, s13, s5 +; GFX9-NEXT: s_add_i32 s2, s14, s6 +; GFX9-NEXT: s_add_i32 s3, s15, s7 +; GFX9-NEXT: s_add_i32 s4, s16, s8 +; GFX9-NEXT: s_add_i32 s5, s17, s9 +; GFX9-NEXT: s_add_i32 s6, s18, s10 +; 
GFX9-NEXT: s_add_i32 s7, s19, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: ; return to shader part epilog %gep1 = getelementptr inbounds <8 x i32>, ptr addrspace(6) %p1, i32 2 %r0 = load <8 x i32>, ptr addrspace(6) %p0 %r1 = load <8 x i32>, ptr addrspace(6) %gep1 @@ -80,17 +246,132 @@ define amdgpu_vs <8 x float> @load_v8i32(ptr addrspace(6) inreg %p0, ptr addrspa ret <8 x float> %r2 } -; GCN-LABEL: {{^}}load_v16i32: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 -; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20 -; VI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 -; VI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 -; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 -; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 define amdgpu_vs <16 x float> @load_v16i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 { +; SICI-LABEL: load_v16i32: +; SICI: ; %bb.0: +; SICI-NEXT: s_mov_b32 s2, s1 +; SICI-NEXT: s_mov_b32 s3, 0 +; SICI-NEXT: s_mov_b32 s1, s3 +; SICI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x20 +; SICI-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0 +; SICI-NEXT: s_waitcnt lgkmcnt(0) +; SICI-NEXT: s_add_i32 s0, s36, s4 +; SICI-NEXT: s_add_i32 s1, s37, s5 +; SICI-NEXT: s_add_i32 s2, s38, s6 +; SICI-NEXT: s_add_i32 s3, s39, s7 +; SICI-NEXT: s_add_i32 s4, s40, s8 +; SICI-NEXT: s_add_i32 s5, s41, s9 +; SICI-NEXT: s_add_i32 s6, s42, s10 +; SICI-NEXT: s_add_i32 s7, s43, s11 +; SICI-NEXT: s_add_i32 s8, s44, s12 +; SICI-NEXT: s_add_i32 s9, s45, s13 +; SICI-NEXT: s_add_i32 s10, s46, s14 +; SICI-NEXT: s_add_i32 s11, s47, s15 +; SICI-NEXT: s_add_i32 s12, s48, s16 +; SICI-NEXT: s_add_i32 s13, s49, s17 +; SICI-NEXT: s_add_i32 s14, 
s50, s18 +; SICI-NEXT: s_add_i32 s15, s51, s19 +; SICI-NEXT: v_mov_b32_e32 v0, s0 +; SICI-NEXT: v_mov_b32_e32 v1, s1 +; SICI-NEXT: v_mov_b32_e32 v2, s2 +; SICI-NEXT: v_mov_b32_e32 v3, s3 +; SICI-NEXT: v_mov_b32_e32 v4, s4 +; SICI-NEXT: v_mov_b32_e32 v5, s5 +; SICI-NEXT: v_mov_b32_e32 v6, s6 +; SICI-NEXT: v_mov_b32_e32 v7, s7 +; SICI-NEXT: v_mov_b32_e32 v8, s8 +; SICI-NEXT: v_mov_b32_e32 v9, s9 +; SICI-NEXT: v_mov_b32_e32 v10, s10 +; SICI-NEXT: v_mov_b32_e32 v11, s11 +; SICI-NEXT: v_mov_b32_e32 v12, s12 +; SICI-NEXT: v_mov_b32_e32 v13, s13 +; SICI-NEXT: v_mov_b32_e32 v14, s14 +; SICI-NEXT: v_mov_b32_e32 v15, s15 +; SICI-NEXT: ; return to shader part epilog +; +; VI-LABEL: load_v16i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s2, s1 +; VI-NEXT: s_mov_b32 s3, 0 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x80 +; VI-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s0, s36, s4 +; VI-NEXT: s_add_i32 s1, s37, s5 +; VI-NEXT: s_add_i32 s2, s38, s6 +; VI-NEXT: s_add_i32 s3, s39, s7 +; VI-NEXT: s_add_i32 s4, s40, s8 +; VI-NEXT: s_add_i32 s5, s41, s9 +; VI-NEXT: s_add_i32 s6, s42, s10 +; VI-NEXT: s_add_i32 s7, s43, s11 +; VI-NEXT: s_add_i32 s8, s44, s12 +; VI-NEXT: s_add_i32 s9, s45, s13 +; VI-NEXT: s_add_i32 s10, s46, s14 +; VI-NEXT: s_add_i32 s11, s47, s15 +; VI-NEXT: s_add_i32 s12, s48, s16 +; VI-NEXT: s_add_i32 s13, s49, s17 +; VI-NEXT: s_add_i32 s14, s50, s18 +; VI-NEXT: s_add_i32 s15, s51, s19 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v7, s7 +; VI-NEXT: v_mov_b32_e32 v8, s8 +; VI-NEXT: v_mov_b32_e32 v9, s9 +; VI-NEXT: v_mov_b32_e32 v10, s10 +; VI-NEXT: v_mov_b32_e32 v11, s11 +; VI-NEXT: v_mov_b32_e32 v12, s12 +; VI-NEXT: v_mov_b32_e32 v13, s13 +; VI-NEXT: v_mov_b32_e32 v14, s14 +; 
VI-NEXT: v_mov_b32_e32 v15, s15 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_v16i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x80 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s0, s36, s4 +; GFX9-NEXT: s_add_i32 s1, s37, s5 +; GFX9-NEXT: s_add_i32 s2, s38, s6 +; GFX9-NEXT: s_add_i32 s3, s39, s7 +; GFX9-NEXT: s_add_i32 s4, s40, s8 +; GFX9-NEXT: s_add_i32 s5, s41, s9 +; GFX9-NEXT: s_add_i32 s6, s42, s10 +; GFX9-NEXT: s_add_i32 s7, s43, s11 +; GFX9-NEXT: s_add_i32 s8, s44, s12 +; GFX9-NEXT: s_add_i32 s9, s45, s13 +; GFX9-NEXT: s_add_i32 s10, s46, s14 +; GFX9-NEXT: s_add_i32 s11, s47, s15 +; GFX9-NEXT: s_add_i32 s12, s48, s16 +; GFX9-NEXT: s_add_i32 s13, s49, s17 +; GFX9-NEXT: s_add_i32 s14, s50, s18 +; GFX9-NEXT: s_add_i32 s15, s51, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-NEXT: v_mov_b32_e32 v10, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s11 +; GFX9-NEXT: v_mov_b32_e32 v12, s12 +; GFX9-NEXT: v_mov_b32_e32 v13, s13 +; GFX9-NEXT: v_mov_b32_e32 v14, s14 +; GFX9-NEXT: v_mov_b32_e32 v15, s15 +; GFX9-NEXT: ; return to shader part epilog %gep1 = getelementptr inbounds <16 x i32>, ptr addrspace(6) %p1, i32 2 %r0 = load <16 x i32>, ptr addrspace(6) %p0 %r1 = load <16 x i32>, ptr addrspace(6) %gep1 @@ -99,17 +380,42 @@ define amdgpu_vs <16 x float> @load_v16i32(ptr addrspace(6) inreg %p0, ptr addrs ret <16 x float> %r2 } -; GCN-LABEL: {{^}}load_float: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dword 
s{{[0-9]}}, s[0:1], 0x0 -; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2 -; VI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 -; VI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 -; GFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 -; GFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 define amdgpu_vs float @load_float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 { +; SICI-LABEL: load_float: +; SICI: ; %bb.0: +; SICI-NEXT: s_mov_b32 s2, s1 +; SICI-NEXT: s_mov_b32 s3, 0 +; SICI-NEXT: s_mov_b32 s1, s3 +; SICI-NEXT: s_load_dword s2, s[2:3], 0x2 +; SICI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SICI-NEXT: s_waitcnt lgkmcnt(0) +; SICI-NEXT: v_mov_b32_e32 v0, s2 +; SICI-NEXT: v_add_f32_e32 v0, s0, v0 +; SICI-NEXT: ; return to shader part epilog +; +; VI-LABEL: load_float: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s2, s1 +; VI-NEXT: s_mov_b32 s3, 0 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_load_dword s2, s[2:3], 0x8 +; VI-NEXT: s_load_dword s0, s[0:1], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_add_f32_e32 v0, s0, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_float: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_add_f32_e32 v0, s5, v0 +; GFX9-NEXT: ; return to shader part epilog %gep1 = getelementptr inbounds float, ptr addrspace(6) %p1, i32 2 %r0 = load float, ptr addrspace(6) %p0 %r1 = load float, ptr addrspace(6) %gep1 @@ -117,20 +423,48 @@ define amdgpu_vs float @load_float(ptr addrspace(6) inreg %p0, ptr addrspace(6) ret float %r } -; GCN-LABEL: {{^}}load_v2float: -; SICIVI-DAG: s_mov_b32 s3, 0 -; SICIVI-DAG: s_mov_b32 s2, s1 -; SICIVI-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4 -; VI-DAG: s_load_dwordx2 
s[{{.*}}], s[0:1], 0x0 -; VI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 -; GFX9-DAG: s_mov_b32 s2, s1 -; GFX9-DAG: s_mov_b32 s3, 0 -; GFX9-DAG: s_mov_b32 s1, s3 -; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 define amdgpu_vs <2 x float> @load_v2float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 { +; SICI-LABEL: load_v2float: +; SICI: ; %bb.0: +; SICI-NEXT: s_mov_b32 s2, s1 +; SICI-NEXT: s_mov_b32 s3, 0 +; SICI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4 +; SICI-NEXT: s_mov_b32 s1, s3 +; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SICI-NEXT: s_waitcnt lgkmcnt(0) +; SICI-NEXT: v_mov_b32_e32 v0, s4 +; SICI-NEXT: v_mov_b32_e32 v1, s5 +; SICI-NEXT: v_add_f32_e32 v0, s0, v0 +; SICI-NEXT: v_add_f32_e32 v1, s1, v1 +; SICI-NEXT: ; return to shader part epilog +; +; VI-LABEL: load_v2float: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s2, s1 +; VI-NEXT: s_mov_b32 s3, 0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x10 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_f32_e32 v0, s0, v0 +; VI-NEXT: v_add_f32_e32 v1, s1, v1 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_v2float: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x10 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX9-NEXT: v_add_f32_e32 v1, s1, v1 +; GFX9-NEXT: ; return to shader part epilog %gep1 = getelementptr inbounds <2 x float>, ptr addrspace(6) %p1, i32 2 %r0 = load <2 x float>, ptr addrspace(6) %p0 %r1 = load <2 x float>, ptr addrspace(6) %gep1 @@ -138,17 +472,60 @@ define amdgpu_vs <2 x float> @load_v2float(ptr addrspace(6) inreg 
%p0, ptr addrs ret <2 x float> %r } -; GCN-LABEL: {{^}}load_v4float: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 -; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8 -; VI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 -; VI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 -; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 -; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 define amdgpu_vs <4 x float> @load_v4float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 { +; SICI-LABEL: load_v4float: +; SICI: ; %bb.0: +; SICI-NEXT: s_mov_b32 s2, s1 +; SICI-NEXT: s_mov_b32 s3, 0 +; SICI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 +; SICI-NEXT: s_mov_b32 s1, s3 +; SICI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SICI-NEXT: s_waitcnt lgkmcnt(0) +; SICI-NEXT: v_mov_b32_e32 v0, s4 +; SICI-NEXT: v_mov_b32_e32 v1, s5 +; SICI-NEXT: v_mov_b32_e32 v2, s6 +; SICI-NEXT: v_mov_b32_e32 v3, s7 +; SICI-NEXT: v_add_f32_e32 v0, s0, v0 +; SICI-NEXT: v_add_f32_e32 v1, s1, v1 +; SICI-NEXT: v_add_f32_e32 v2, s2, v2 +; SICI-NEXT: v_add_f32_e32 v3, s3, v3 +; SICI-NEXT: ; return to shader part epilog +; +; VI-LABEL: load_v4float: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s2, s1 +; VI-NEXT: s_mov_b32 s3, 0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x20 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_add_f32_e32 v0, s0, v0 +; VI-NEXT: v_add_f32_e32 v1, s1, v1 +; VI-NEXT: v_add_f32_e32 v2, s2, v2 +; VI-NEXT: v_add_f32_e32 v3, s3, v3 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_v4float: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x20 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX9-NEXT: v_add_f32_e32 v1, s1, v1 +; GFX9-NEXT: v_add_f32_e32 v2, s2, v2 +; GFX9-NEXT: v_add_f32_e32 v3, s3, v3 +; GFX9-NEXT: ; return to shader part epilog %gep1 = getelementptr inbounds <4 x float>, ptr addrspace(6) %p1, i32 2 %r0 = load <4 x float>, ptr addrspace(6) %p0 %r1 = load <4 x float>, ptr addrspace(6) %gep1 @@ -156,17 +533,84 @@ define amdgpu_vs <4 x float> @load_v4float(ptr addrspace(6) inreg %p0, ptr addrs ret <4 x float> %r } -; GCN-LABEL: {{^}}load_v8float: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 -; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10 -; VI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 -; VI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 -; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 -; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 define amdgpu_vs <8 x float> @load_v8float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 { +; SICI-LABEL: load_v8float: +; SICI: ; %bb.0: +; SICI-NEXT: s_mov_b32 s2, s1 +; SICI-NEXT: s_mov_b32 s3, 0 +; SICI-NEXT: s_mov_b32 s1, s3 +; SICI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x10 +; SICI-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x0 +; SICI-NEXT: s_waitcnt lgkmcnt(0) +; SICI-NEXT: v_mov_b32_e32 v0, s4 +; SICI-NEXT: v_mov_b32_e32 v1, s5 +; SICI-NEXT: v_mov_b32_e32 v2, s6 +; SICI-NEXT: v_mov_b32_e32 v3, s7 +; SICI-NEXT: v_mov_b32_e32 v4, s8 +; SICI-NEXT: v_mov_b32_e32 v5, s9 +; SICI-NEXT: v_mov_b32_e32 v6, s10 +; SICI-NEXT: v_mov_b32_e32 v7, s11 +; SICI-NEXT: v_add_f32_e32 v0, s12, v0 +; SICI-NEXT: v_add_f32_e32 v1, s13, v1 +; SICI-NEXT: v_add_f32_e32 v2, s14, v2 +; SICI-NEXT: v_add_f32_e32 v3, s15, v3 +; SICI-NEXT: v_add_f32_e32 v4, s16, v4 +; SICI-NEXT: v_add_f32_e32 v5, s17, v5 +; 
SICI-NEXT: v_add_f32_e32 v6, s18, v6 +; SICI-NEXT: v_add_f32_e32 v7, s19, v7 +; SICI-NEXT: ; return to shader part epilog +; +; VI-LABEL: load_v8float: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s2, s1 +; VI-NEXT: s_mov_b32 s3, 0 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x40 +; VI-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s8 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v6, s10 +; VI-NEXT: v_mov_b32_e32 v7, s11 +; VI-NEXT: v_add_f32_e32 v0, s12, v0 +; VI-NEXT: v_add_f32_e32 v1, s13, v1 +; VI-NEXT: v_add_f32_e32 v2, s14, v2 +; VI-NEXT: v_add_f32_e32 v3, s15, v3 +; VI-NEXT: v_add_f32_e32 v4, s16, v4 +; VI-NEXT: v_add_f32_e32 v5, s17, v5 +; VI-NEXT: v_add_f32_e32 v6, s18, v6 +; VI-NEXT: v_add_f32_e32 v7, s19, v7 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_v8float: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x40 +; GFX9-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_add_f32_e32 v0, s12, v0 +; GFX9-NEXT: v_add_f32_e32 v1, s13, v1 +; GFX9-NEXT: v_add_f32_e32 v2, s14, v2 +; GFX9-NEXT: v_add_f32_e32 v3, s15, v3 +; GFX9-NEXT: v_add_f32_e32 v4, s16, v4 +; GFX9-NEXT: v_add_f32_e32 v5, s17, v5 +; GFX9-NEXT: v_add_f32_e32 v6, s18, v6 +; GFX9-NEXT: v_add_f32_e32 v7, s19, v7 +; GFX9-NEXT: ; return to shader part epilog %gep1 = getelementptr inbounds <8 x float>, ptr addrspace(6) %p1, i32 2 %r0 
= load <8 x float>, ptr addrspace(6) %p0 %r1 = load <8 x float>, ptr addrspace(6) %gep1 @@ -174,17 +618,132 @@ define amdgpu_vs <8 x float> @load_v8float(ptr addrspace(6) inreg %p0, ptr addrs ret <8 x float> %r } -; GCN-LABEL: {{^}}load_v16float: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 -; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20 -; VI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 -; VI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 -; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 -; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 define amdgpu_vs <16 x float> @load_v16float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 { +; SICI-LABEL: load_v16float: +; SICI: ; %bb.0: +; SICI-NEXT: s_mov_b32 s3, 0 +; SICI-NEXT: s_mov_b32 s2, s1 +; SICI-NEXT: s_mov_b32 s1, s3 +; SICI-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x20 +; SICI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x0 +; SICI-NEXT: s_waitcnt lgkmcnt(0) +; SICI-NEXT: v_mov_b32_e32 v0, s16 +; SICI-NEXT: v_mov_b32_e32 v1, s17 +; SICI-NEXT: v_mov_b32_e32 v2, s18 +; SICI-NEXT: v_mov_b32_e32 v3, s19 +; SICI-NEXT: v_mov_b32_e32 v4, s20 +; SICI-NEXT: v_mov_b32_e32 v5, s21 +; SICI-NEXT: v_mov_b32_e32 v6, s22 +; SICI-NEXT: v_mov_b32_e32 v7, s23 +; SICI-NEXT: v_mov_b32_e32 v8, s24 +; SICI-NEXT: v_mov_b32_e32 v9, s25 +; SICI-NEXT: v_mov_b32_e32 v10, s26 +; SICI-NEXT: v_mov_b32_e32 v11, s27 +; SICI-NEXT: v_mov_b32_e32 v12, s28 +; SICI-NEXT: v_mov_b32_e32 v13, s29 +; SICI-NEXT: v_mov_b32_e32 v14, s30 +; SICI-NEXT: v_mov_b32_e32 v15, s31 +; SICI-NEXT: v_add_f32_e32 v0, s0, v0 +; SICI-NEXT: v_add_f32_e32 v1, s1, v1 +; SICI-NEXT: v_add_f32_e32 v2, s2, v2 +; SICI-NEXT: v_add_f32_e32 v3, s3, v3 +; SICI-NEXT: v_add_f32_e32 v4, s4, v4 +; SICI-NEXT: v_add_f32_e32 v5, s5, v5 +; SICI-NEXT: v_add_f32_e32 v6, s6, v6 +; SICI-NEXT: v_add_f32_e32 v7, s7, v7 +; SICI-NEXT: v_add_f32_e32 v8, s8, v8 +; SICI-NEXT: v_add_f32_e32 v9, s9, 
v9 +; SICI-NEXT: v_add_f32_e32 v10, s10, v10 +; SICI-NEXT: v_add_f32_e32 v11, s11, v11 +; SICI-NEXT: v_add_f32_e32 v12, s12, v12 +; SICI-NEXT: v_add_f32_e32 v13, s13, v13 +; SICI-NEXT: v_add_f32_e32 v14, s14, v14 +; SICI-NEXT: v_add_f32_e32 v15, s15, v15 +; SICI-NEXT: ; return to shader part epilog +; +; VI-LABEL: load_v16float: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s3, 0 +; VI-NEXT: s_mov_b32 s2, s1 +; VI-NEXT: s_mov_b32 s1, s3 +; VI-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x80 +; VI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: v_add_f32_e32 v0, s0, v0 +; VI-NEXT: v_add_f32_e32 v1, s1, v1 +; VI-NEXT: v_add_f32_e32 v2, s2, v2 +; VI-NEXT: v_add_f32_e32 v3, s3, v3 +; VI-NEXT: v_add_f32_e32 v4, s4, v4 +; VI-NEXT: v_add_f32_e32 v5, s5, v5 +; VI-NEXT: v_add_f32_e32 v6, s6, v6 +; VI-NEXT: v_add_f32_e32 v7, s7, v7 +; VI-NEXT: v_add_f32_e32 v8, s8, v8 +; VI-NEXT: v_add_f32_e32 v9, s9, v9 +; VI-NEXT: v_add_f32_e32 v10, s10, v10 +; VI-NEXT: v_add_f32_e32 v11, s11, v11 +; VI-NEXT: v_add_f32_e32 v12, s12, v12 +; VI-NEXT: v_add_f32_e32 v13, s13, v13 +; VI-NEXT: v_add_f32_e32 v14, s14, v14 +; VI-NEXT: v_add_f32_e32 v15, s15, v15 +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_v16float: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s2, s1 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x80 +; GFX9-NEXT: s_load_dwordx16 
s[4:19], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s36 +; GFX9-NEXT: v_mov_b32_e32 v1, s37 +; GFX9-NEXT: v_mov_b32_e32 v2, s38 +; GFX9-NEXT: v_mov_b32_e32 v3, s39 +; GFX9-NEXT: v_mov_b32_e32 v4, s40 +; GFX9-NEXT: v_mov_b32_e32 v5, s41 +; GFX9-NEXT: v_mov_b32_e32 v6, s42 +; GFX9-NEXT: v_mov_b32_e32 v7, s43 +; GFX9-NEXT: v_mov_b32_e32 v8, s44 +; GFX9-NEXT: v_mov_b32_e32 v9, s45 +; GFX9-NEXT: v_mov_b32_e32 v10, s46 +; GFX9-NEXT: v_mov_b32_e32 v11, s47 +; GFX9-NEXT: v_mov_b32_e32 v12, s48 +; GFX9-NEXT: v_mov_b32_e32 v13, s49 +; GFX9-NEXT: v_mov_b32_e32 v14, s50 +; GFX9-NEXT: v_mov_b32_e32 v15, s51 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_f32_e32 v1, s5, v1 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v2 +; GFX9-NEXT: v_add_f32_e32 v3, s7, v3 +; GFX9-NEXT: v_add_f32_e32 v4, s8, v4 +; GFX9-NEXT: v_add_f32_e32 v5, s9, v5 +; GFX9-NEXT: v_add_f32_e32 v6, s10, v6 +; GFX9-NEXT: v_add_f32_e32 v7, s11, v7 +; GFX9-NEXT: v_add_f32_e32 v8, s12, v8 +; GFX9-NEXT: v_add_f32_e32 v9, s13, v9 +; GFX9-NEXT: v_add_f32_e32 v10, s14, v10 +; GFX9-NEXT: v_add_f32_e32 v11, s15, v11 +; GFX9-NEXT: v_add_f32_e32 v12, s16, v12 +; GFX9-NEXT: v_add_f32_e32 v13, s17, v13 +; GFX9-NEXT: v_add_f32_e32 v14, s18, v14 +; GFX9-NEXT: v_add_f32_e32 v15, s19, v15 +; GFX9-NEXT: ; return to shader part epilog %gep1 = getelementptr inbounds <16 x float>, ptr addrspace(6) %p1, i32 2 %r0 = load <16 x float>, ptr addrspace(6) %p0 %r1 = load <16 x float>, ptr addrspace(6) %gep1 @@ -192,45 +751,107 @@ define amdgpu_vs <16 x float> @load_v16float(ptr addrspace(6) inreg %p0, ptr add ret <16 x float> %r } -; GCN-LABEL: {{^}}load_i32_hi0: -; GCN: s_mov_b32 s1, 0 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 define amdgpu_vs i32 @load_i32_hi0(ptr addrspace(6) inreg %p) #1 { +; GCN-LABEL: load_i32_hi0: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s1, 0 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog %r0 = load i32, 
ptr addrspace(6) %p ret i32 %r0 } -; GCN-LABEL: {{^}}load_i32_hi1: -; GCN: s_mov_b32 s1, 1 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 define amdgpu_vs i32 @load_i32_hi1(ptr addrspace(6) inreg %p) #2 { +; GCN-LABEL: load_i32_hi1: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s1, 1 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog %r0 = load i32, ptr addrspace(6) %p ret i32 %r0 } -; GCN-LABEL: {{^}}load_i32_hiffff8000: -; GCN: s_movk_i32 s1, 0x8000 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 define amdgpu_vs i32 @load_i32_hiffff8000(ptr addrspace(6) inreg %p) #3 { +; GCN-LABEL: load_i32_hiffff8000: +; GCN: ; %bb.0: +; GCN-NEXT: s_movk_i32 s1, 0x8000 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog %r0 = load i32, ptr addrspace(6) %p ret i32 %r0 } -; GCN-LABEL: {{^}}load_i32_hifffffff0: -; GCN: s_mov_b32 s1, -16 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 define amdgpu_vs i32 @load_i32_hifffffff0(ptr addrspace(6) inreg %p) #4 { +; GCN-LABEL: load_i32_hifffffff0: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s1, -16 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog %r0 = load i32, ptr addrspace(6) %p ret i32 %r0 } -; GCN-LABEL: {{^}}load_sampler -; GCN: v_readfirstlane_b32 -; SI: s_nop -; GCN: s_load_dwordx8 -; GCN-NEXT: s_load_dwordx4 -; GCN: image_sample define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @load_sampler(ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x 
i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 { +; SI-LABEL: load_sampler: +; SI: ; %bb.0: ; %main_body +; SI-NEXT: s_mov_b64 s[6:7], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: s_mov_b32 m0, s5 +; SI-NEXT: v_interp_mov_f32 v0, p0, attr0.x +; SI-NEXT: v_lshlrev_b32_e32 v0, 6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s1, v0 +; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_mov_b32 s1, 0 +; SI-NEXT: s_nop 2 +; SI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xc +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: image_sample v[0:3], v0, s[8:15], s[0:3] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: load_sampler: +; VI: ; %bb.0: ; %main_body +; VI-NEXT: s_mov_b64 s[6:7], exec +; VI-NEXT: s_wqm_b64 exec, exec +; VI-NEXT: s_mov_b32 m0, s5 +; VI-NEXT: v_interp_mov_f32_e32 v0, p0, attr0.x +; VI-NEXT: v_lshlrev_b32_e32 v0, 6, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s1, v0 +; VI-NEXT: v_readfirstlane_b32 s0, v0 +; VI-NEXT: s_mov_b32 s1, 0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x30 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_and_b64 exec, exec, s[6:7] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: image_sample v[0:3], v0, s[8:15], s[0:3] dmask:0xf +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_sampler: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: s_mov_b32 s17, 0 +; GFX9-NEXT: v_interp_mov_f32_e32 v0, p0, attr0.x +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 6, s1 +; GFX9-NEXT: v_readfirstlane_b32 s16, v0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[16:17], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[16:17], 0x30 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: image_sample v[0:3], v0, s[8:15], s[0:3] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog main_body: %22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8 %23 = bitcast float %22 to i32 @@ -256,13 +877,63 @@ main_body: ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42 } -; GCN-LABEL: {{^}}load_sampler_nouniform -; GCN: v_readfirstlane_b32 -; SI: s_nop -; GCN: s_load_dwordx8 -; GCN-NEXT: s_load_dwordx4 -; GCN: image_sample define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @load_sampler_nouniform(ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 { +; SI-LABEL: load_sampler_nouniform: +; SI: ; %bb.0: ; %main_body +; SI-NEXT: s_mov_b64 s[6:7], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: s_mov_b32 m0, s5 +; SI-NEXT: v_interp_mov_f32 v0, p0, attr0.x +; SI-NEXT: v_lshlrev_b32_e32 v0, 6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s1, v0 +; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_mov_b32 s1, 0 +; SI-NEXT: s_nop 2 +; SI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xc +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: image_sample v[0:3], v0, s[8:15], s[0:3] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: 
load_sampler_nouniform: +; VI: ; %bb.0: ; %main_body +; VI-NEXT: s_mov_b64 s[6:7], exec +; VI-NEXT: s_wqm_b64 exec, exec +; VI-NEXT: s_mov_b32 m0, s5 +; VI-NEXT: v_interp_mov_f32_e32 v0, p0, attr0.x +; VI-NEXT: v_lshlrev_b32_e32 v0, 6, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s1, v0 +; VI-NEXT: v_readfirstlane_b32 s0, v0 +; VI-NEXT: s_mov_b32 s1, 0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x30 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_and_b64 exec, exec, s[6:7] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: image_sample v[0:3], v0, s[8:15], s[0:3] dmask:0xf +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_sampler_nouniform: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: s_mov_b32 m0, s5 +; GFX9-NEXT: s_mov_b32 s17, 0 +; GFX9-NEXT: v_interp_mov_f32_e32 v0, p0, attr0.x +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 6, s1 +; GFX9-NEXT: v_readfirstlane_b32 s16, v0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[16:17], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[16:17], 0x30 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: image_sample v[0:3], v0, s[8:15], s[0:3] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog main_body: %22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8 %23 = bitcast float %22 to i32 @@ -288,22 +959,54 @@ main_body: ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42 } -; GCN-LABEL: {{^}}load_addr_no_fold: -; GCN-DAG: s_add_i32 s0, s0, 4 -; GCN-DAG: s_mov_b32 s1, 0 -; GCN: s_load_dword s{{[0-9]}}, s[0:1], 0x0 define amdgpu_vs float @load_addr_no_fold(ptr addrspace(6) inreg noalias %p0) #0 { +; GCN-LABEL: load_addr_no_fold: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s0, s0, 4 +; 
GCN-NEXT: s_mov_b32 s1, 0 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog %gep1 = getelementptr i32, ptr addrspace(6) %p0, i32 1 %r1 = load i32, ptr addrspace(6) %gep1 %r2 = bitcast i32 %r1 to float ret float %r2 } -; GCN-LABEL: {{^}}vgpr_arg_src: -; GCN: v_readfirstlane_b32 s[[READLANE:[0-9]+]], v0 -; GCN: s_mov_b32 s[[ZERO:[0-9]+]] -; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[[[READLANE]]:[[ZERO]]] define amdgpu_vs float @vgpr_arg_src(ptr addrspace(6) %arg) { +; SI-LABEL: vgpr_arg_src: +; SI: ; %bb.0: ; %main_body +; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_mov_b32 s1, 0 +; SI-NEXT: s_nop 2 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: vgpr_arg_src: +; VI: ; %bb.0: ; %main_body +; VI-NEXT: v_readfirstlane_b32 s0, v0 +; VI-NEXT: s_mov_b32 s1, 0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_nop 1 +; VI-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: vgpr_arg_src: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog main_body: %tmp9 = load ptr addrspace(8), ptr addrspace(6) %arg %tmp10 = call nsz float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %tmp9, i32 poison, i32 0, i32 0, i32 0) #1 @@ -329,3 +1032,5 @@ attributes #5 = { "InitialPSInputAddr"="45175" } attributes #6 = { nounwind readnone speculatable } attributes #7 = { 
nounwind memory(argmem: read) } attributes #8 = { nounwind readnone } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; SICIVI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll index ac9a279491668..0a9edbb311ace 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll @@ -10,18 +10,19 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspa ; RRLIST-NEXT: v_mov_b32_e32 v2, 0 ; RRLIST-NEXT: s_waitcnt lgkmcnt(0) ; RRLIST-NEXT: s_load_dword s16, s[12:13], 0x0 -; RRLIST-NEXT: s_load_dwordx2 s[0:1], s[10:11], 0x0 -; RRLIST-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; RRLIST-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; RRLIST-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0 ; RRLIST-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x44 -; RRLIST-NEXT: s_load_dword s17, s[14:15], 0x0 +; RRLIST-NEXT: s_nop 0 +; RRLIST-NEXT: s_load_dword s8, s[14:15], 0x0 ; RRLIST-NEXT: s_waitcnt lgkmcnt(0) -; RRLIST-NEXT: s_min_i32 s8, s16, 0 -; RRLIST-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; RRLIST-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; RRLIST-NEXT: s_min_i32 s9, s16, 0 +; RRLIST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; RRLIST-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; RRLIST-NEXT: s_and_b64 s[4:5], vcc, exec -; RRLIST-NEXT: s_cselect_b32 s4, s16, s17 -; RRLIST-NEXT: s_cmp_eq_u64 s[2:3], s[0:1] -; RRLIST-NEXT: s_cselect_b32 s0, s8, s4 +; RRLIST-NEXT: s_cselect_b32 s4, s16, s8 +; RRLIST-NEXT: s_cmp_eq_u64 s[0:1], s[2:3] +; RRLIST-NEXT: s_cselect_b32 s0, s9, s4 ; RRLIST-NEXT: v_mov_b32_e32 v0, s0 ; RRLIST-NEXT: global_store_dword v2, v0, s[6:7] ; RRLIST-NEXT: s_endpgm @@ -29,16 +30,16 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspa ; FAST-LABEL: sccClobber: ; FAST: ; %bb.0: ; %entry ; FAST-NEXT: s_load_dwordx8 
s[8:15], s[4:5], 0x24 +; FAST-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x44 ; FAST-NEXT: v_mov_b32_e32 v2, 0 ; FAST-NEXT: s_waitcnt lgkmcnt(0) -; FAST-NEXT: s_load_dword s16, s[12:13], 0x0 ; FAST-NEXT: s_load_dwordx2 s[0:1], s[10:11], 0x0 -; FAST-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 -; FAST-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x44 +; FAST-NEXT: s_load_dword s16, s[12:13], 0x0 ; FAST-NEXT: s_load_dword s17, s[14:15], 0x0 +; FAST-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; FAST-NEXT: s_waitcnt lgkmcnt(0) -; FAST-NEXT: s_min_i32 s8, s16, 0 ; FAST-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; FAST-NEXT: s_min_i32 s8, s16, 0 ; FAST-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; FAST-NEXT: s_and_b64 s[4:5], vcc, exec ; FAST-NEXT: s_cselect_b32 s4, s16, s17 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index fb418afb8b039..0ac6d5bffc218 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -1205,15 +1205,15 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ushort v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: flat_load_ushort v1, v[2:3] ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bcnt_u32_b32 v0, v3, v0 +; VI-NEXT: v_bcnt_u32_b32 v0, v0, v1 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -1292,7 +1292,7 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: 
.LBB14_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_branch .LBB14_2 ; ; VI-LABEL: ctpop_i16_in_br: diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index d1090738e24a6..e81eb018e6a3e 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -2974,15 +2974,15 @@ define amdgpu_kernel void @cvt_f32_ubyte0_vector() local_unnamed_addr { ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dword s4, s[0:1], 0x0 ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:3 ; SI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:2 ; SI-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:1 ; SI-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_fma_f32 v0, s0, v0, 0.5 +; SI-NEXT: v_fma_f32 v0, s4, v0, 0.5 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 @@ -2999,15 +2999,15 @@ define amdgpu_kernel void @cvt_f32_ubyte0_vector() local_unnamed_addr { ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s4, s[0:1], 0x0 ; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:3 ; VI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:2 ; VI-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:1 ; VI-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 -; VI-NEXT: s_load_dword s0, s[0:1], 0x0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-NEXT: v_mul_f32_e32 v0, s4, v0 ; VI-NEXT: v_add_f32_e32 v0, 0.5, v0 ; VI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -3024,17 +3024,16 @@ define amdgpu_kernel void @cvt_f32_ubyte0_vector() 
local_unnamed_addr { ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_clause 0x3 ; GFX10-NEXT: global_load_ubyte v1, v0, s[0:1] offset:3 ; GFX10-NEXT: global_load_ubyte v2, v0, s[0:1] offset:2 ; GFX10-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1 ; GFX10-NEXT: global_load_ubyte v4, v0, s[0:1] -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt vmcnt(3) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_fma_f32 v0, s0, v0, 0.5 +; GFX10-NEXT: v_fma_f32 v0, s2, v0, 0.5 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: global_store_byte v[0:1], v2, off @@ -3073,17 +3072,17 @@ define amdgpu_kernel void @cvt_f32_ubyte0_vector() local_unnamed_addr { ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_u8 v1, v0, s[0:1] offset:3 ; GFX11-NEXT: global_load_u8 v2, v0, s[0:1] offset:2 ; GFX11-NEXT: global_load_u8 v3, v0, s[0:1] offset:1 ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f32 v1, s0, v1, 0.5 +; GFX11-NEXT: v_fma_f32 v1, s2, v1, 0.5 ; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_clause 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index 45fe2d07226a1..5ce299262805b 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ 
b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -300,6 +300,7 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -595,6 +596,7 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index 06c30dfd36033..271160ba73652 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -1235,19 +1235,19 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving( ; CI-NEXT: v_add_i32_e32 v3, vcc, s1, v0 ; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v1 ; CI-NEXT: v_add_i32_e32 v6, vcc, s3, v0 -; CI-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 -; CI-NEXT: ds_read2_b32 v[2:3], v3 offset1:4 +; CI-NEXT: ds_read2_b32 v[0:1], v3 offset1:4 +; CI-NEXT: ds_read2_b32 v[2:3], v2 offset1:1 ; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:1 ; CI-NEXT: ds_read2_b32 v[6:7], v6 offset1:4 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(2) -; CI-NEXT: v_mul_f32_e32 v0, v0, v2 +; CI-NEXT: v_mul_f32_e32 v0, v2, v0 ; CI-NEXT: v_add_f32_e32 v0, 2.0, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mul_f32_e32 v2, v4, v6 ; CI-NEXT: v_sub_f32_e32 v0, v0, v2 -; CI-NEXT: v_mul_f32_e32 v1, v1, v3 +; CI-NEXT: v_mul_f32_e32 v1, v3, v1 ; CI-NEXT: v_sub_f32_e32 v0, v0, v1 ; CI-NEXT: v_mul_f32_e32 v1, v5, v7 ; CI-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1265,17 +1265,17 @@ define amdgpu_kernel void 
@ds_read_diff_base_interleaving( ; GFX9-NEXT: v_add_u32_e32 v3, s1, v0 ; GFX9-NEXT: v_add_u32_e32 v4, s2, v1 ; GFX9-NEXT: v_add_u32_e32 v6, s3, v0 -; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 -; GFX9-NEXT: ds_read2_b32 v[2:3], v3 offset1:4 +; GFX9-NEXT: ds_read2_b32 v[0:1], v3 offset1:4 +; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[6:7], v6 offset1:4 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_mul_f32_e32 v0, v2, v0 ; GFX9-NEXT: v_add_f32_e32 v0, 2.0, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v4, v6 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll index 1d83d33a4f832..b64324ab1d4b2 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -2521,6 +2521,7 @@ define amdgpu_kernel void @fcmp_v2f16_lt( ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -2551,6 +2552,7 @@ define amdgpu_kernel void @fcmp_v2f16_lt( ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -2581,6 +2583,7 @@ define amdgpu_kernel void @fcmp_v2f16_lt( ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2 
; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX12-TRUE16-NEXT: s_clause 0x1 ; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -2612,6 +2615,7 @@ define amdgpu_kernel void @fcmp_v2f16_lt( ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX12-FAKE16-NEXT: s_clause 0x1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -2716,6 +2720,7 @@ define amdgpu_kernel void @fcmp_v2f16_eq( ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -2746,6 +2751,7 @@ define amdgpu_kernel void @fcmp_v2f16_eq( ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -2776,6 +2782,7 @@ define amdgpu_kernel void @fcmp_v2f16_eq( ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX12-TRUE16-NEXT: s_clause 0x1 ; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -2807,6 +2814,7 @@ define amdgpu_kernel void @fcmp_v2f16_eq( ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX12-FAKE16-NEXT: s_clause 0x1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; 
GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -2910,6 +2918,7 @@ define amdgpu_kernel void @fcmp_v2f16_le( ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -2940,6 +2949,7 @@ define amdgpu_kernel void @fcmp_v2f16_le( ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -2970,6 +2980,7 @@ define amdgpu_kernel void @fcmp_v2f16_le( ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX12-TRUE16-NEXT: s_clause 0x1 ; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -3001,6 +3012,7 @@ define amdgpu_kernel void @fcmp_v2f16_le( ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX12-FAKE16-NEXT: s_clause 0x1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -3104,6 +3116,7 @@ define amdgpu_kernel void @fcmp_v2f16_gt( ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -3134,6 +3147,7 @@ define amdgpu_kernel 
void @fcmp_v2f16_gt( ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -3164,6 +3178,7 @@ define amdgpu_kernel void @fcmp_v2f16_gt( ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX12-TRUE16-NEXT: s_clause 0x1 ; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -3195,6 +3210,7 @@ define amdgpu_kernel void @fcmp_v2f16_gt( ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX12-FAKE16-NEXT: s_clause 0x1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -3299,6 +3315,7 @@ define amdgpu_kernel void @fcmp_v2f16_lg( ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -3329,6 +3346,7 @@ define amdgpu_kernel void @fcmp_v2f16_lg( ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -3359,6 +3377,7 @@ define amdgpu_kernel void @fcmp_v2f16_lg( ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3 +; 
GFX12-TRUE16-NEXT: s_clause 0x1 ; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -3390,6 +3409,7 @@ define amdgpu_kernel void @fcmp_v2f16_lg( ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX12-FAKE16-NEXT: s_clause 0x1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -3494,6 +3514,7 @@ define amdgpu_kernel void @fcmp_v2f16_ge( ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -3524,6 +3545,7 @@ define amdgpu_kernel void @fcmp_v2f16_ge( ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -3554,6 +3576,7 @@ define amdgpu_kernel void @fcmp_v2f16_ge( ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX12-TRUE16-NEXT: s_clause 0x1 ; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -3585,6 +3608,7 @@ define amdgpu_kernel void @fcmp_v2f16_ge( ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX12-FAKE16-NEXT: s_clause 0x1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, 
s[12:15], null ; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -3689,6 +3713,7 @@ define amdgpu_kernel void @fcmp_v2f16_o( ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -3719,6 +3744,7 @@ define amdgpu_kernel void @fcmp_v2f16_o( ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -3749,6 +3775,7 @@ define amdgpu_kernel void @fcmp_v2f16_o( ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX12-TRUE16-NEXT: s_clause 0x1 ; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -3780,6 +3807,7 @@ define amdgpu_kernel void @fcmp_v2f16_o( ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX12-FAKE16-NEXT: s_clause 0x1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -3884,6 +3912,7 @@ define amdgpu_kernel void @fcmp_v2f16_u( ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -3914,6 +3943,7 @@ define amdgpu_kernel void @fcmp_v2f16_u( ; GFX11-FAKE16-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -3944,6 +3974,7 @@ define amdgpu_kernel void @fcmp_v2f16_u( ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX12-TRUE16-NEXT: s_clause 0x1 ; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -3975,6 +4006,7 @@ define amdgpu_kernel void @fcmp_v2f16_u( ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX12-FAKE16-NEXT: s_clause 0x1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -4078,6 +4110,7 @@ define amdgpu_kernel void @fcmp_v2f16_nge( ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -4108,6 +4141,7 @@ define amdgpu_kernel void @fcmp_v2f16_nge( ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -4138,6 +4172,7 @@ define amdgpu_kernel void @fcmp_v2f16_nge( ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX12-TRUE16-NEXT: s_clause 0x1 ; 
GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -4169,6 +4204,7 @@ define amdgpu_kernel void @fcmp_v2f16_nge( ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX12-FAKE16-NEXT: s_clause 0x1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -4272,6 +4308,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlg( ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -4302,6 +4339,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlg( ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -4332,6 +4370,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlg( ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX12-TRUE16-NEXT: s_clause 0x1 ; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -4363,6 +4402,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlg( ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX12-FAKE16-NEXT: s_clause 0x1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; 
GFX12-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -4467,6 +4507,7 @@ define amdgpu_kernel void @fcmp_v2f16_ngt( ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -4497,6 +4538,7 @@ define amdgpu_kernel void @fcmp_v2f16_ngt( ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -4527,6 +4569,7 @@ define amdgpu_kernel void @fcmp_v2f16_ngt( ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX12-TRUE16-NEXT: s_clause 0x1 ; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -4558,6 +4601,7 @@ define amdgpu_kernel void @fcmp_v2f16_ngt( ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX12-FAKE16-NEXT: s_clause 0x1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -4661,6 +4705,7 @@ define amdgpu_kernel void @fcmp_v2f16_nle( ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -4691,6 +4736,7 @@ define amdgpu_kernel void @fcmp_v2f16_nle( ; GFX11-FAKE16-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -4721,6 +4767,7 @@ define amdgpu_kernel void @fcmp_v2f16_nle( ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX12-TRUE16-NEXT: s_clause 0x1 ; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -4752,6 +4799,7 @@ define amdgpu_kernel void @fcmp_v2f16_nle( ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX12-FAKE16-NEXT: s_clause 0x1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -4855,6 +4903,7 @@ define amdgpu_kernel void @fcmp_v2f16_neq( ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -4885,6 +4934,7 @@ define amdgpu_kernel void @fcmp_v2f16_neq( ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -4915,6 +4965,7 @@ define amdgpu_kernel void @fcmp_v2f16_neq( ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX12-TRUE16-NEXT: s_clause 0x1 ; 
GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -4946,6 +4997,7 @@ define amdgpu_kernel void @fcmp_v2f16_neq( ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX12-FAKE16-NEXT: s_clause 0x1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -5049,6 +5101,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlt( ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -5079,6 +5132,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlt( ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -5109,6 +5163,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlt( ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX12-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX12-TRUE16-NEXT: s_clause 0x1 ; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; GFX12-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -5140,6 +5195,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlt( ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX12-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX12-FAKE16-NEXT: s_clause 0x1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], null ; GFX12-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], null ; 
GFX12-FAKE16-NEXT: s_mov_b32 s8, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index b4b9c2d3e0135..e8ff1b34f305b 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -962,8 +962,9 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] -; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7] +; GFX9-NEXT: s_brev_b32 s2, -2 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -976,13 +977,15 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0x3ff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] -; GFX11-TRUE16-NEXT: global_load_b32 v1, v1, s[4:5] +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] +; GFX11-TRUE16-NEXT: global_load_b32 v1, v2, s[4:5] +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -1001,6 +1004,7 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 ; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_u16 v1, v1, s[2:3] ; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[4:5] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -1101,6 +1105,7 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 1, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] ; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v1, s[4:5] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -1121,6 +1126,7 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_u16 v2, v1, s[2:3] ; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v0, s[4:5] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -1202,10 +1208,11 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v1, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -1221,6 +1228,7 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: 
s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[6:7] ; GFX11-TRUE16-NEXT: global_load_b32 v1, v1, s[2:3] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -1241,6 +1249,7 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_u16 v1, v1, s[6:7] ; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -1342,6 +1351,7 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 1, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[6:7] ; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v1, s[2:3] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -1362,6 +1372,7 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_u16 v2, v1, s[6:7] ; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -1444,10 +1455,11 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v1, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] ; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 ; GFX9-NEXT: global_store_short v2, v0, s[0:1] @@ -1463,6 +1475,7 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v1, v1, s[6:7] ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -1484,6 +1497,7 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v1, v1, s[6:7] ; GFX11-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -1582,6 +1596,7 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -1602,6 +1617,7 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[2:3] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -1686,8 +1702,9 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
global_load_dword v1, v1, s[2:3] -; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_ushort v0, v0, s[6:7] +; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1705,6 +1722,7 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v1, v1, s[2:3] ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[4:5] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -1726,6 +1744,7 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v1, v1, s[2:3] ; GFX11-FAKE16-NEXT: global_load_u16 v0, v0, s[4:5] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll index a96d022b66f12..6762277a4a651 100644 --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -1974,8 +1974,8 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; SI-NOFMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 +; SI-NOFMA-NEXT: s_mov_b32 s18, s10 +; SI-NOFMA-NEXT: s_mov_b32 s19, s11 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s16, s4 ; SI-NOFMA-NEXT: s_mov_b32 s17, s5 @@ -1983,22 +1983,21 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; SI-NOFMA-NEXT: s_mov_b32 s5, s7 ; SI-NOFMA-NEXT: s_mov_b32 
s6, s10 ; SI-NOFMA-NEXT: s_mov_b32 s7, s11 +; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NOFMA-NEXT: s_mov_b32 s14, s10 ; SI-NOFMA-NEXT: s_mov_b32 s12, s2 ; SI-NOFMA-NEXT: s_mov_b32 s13, s3 -; SI-NOFMA-NEXT: s_mov_b32 s18, s10 -; SI-NOFMA-NEXT: s_mov_b32 s19, s11 -; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NOFMA-NEXT: s_mov_b32 s15, s11 ; SI-NOFMA-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; SI-NOFMA-NEXT: s_mov_b32 s8, s0 ; SI-NOFMA-NEXT: s_mov_b32 s9, s1 -; SI-NOFMA-NEXT: s_waitcnt vmcnt(2) -; SI-NOFMA-NEXT: v_sub_f32_e32 v3, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) -; SI-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-NOFMA-NEXT: v_sub_f32_e32 v3, 1.0, v1 +; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v3 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: v_mac_f32_e32 v1, v2, v0 -; SI-NOFMA-NEXT: buffer_store_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: v_mac_f32_e32 v0, v2, v1 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_interp: diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index db0c5362bdc5f..5ab09060403b3 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -8124,6 +8124,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: v_add_f32_e32 v1, 0x41800000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_med3_f32 v2, v2, 1.0, 0x41800000 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8305,6 +8306,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 0x41000000, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: 
v_med3_f32 v2, v2, s2, 0x41800000 +; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v3, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8326,6 +8328,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; GFX11-GISEL-NEXT: v_med3_f32 v2, v3, 0x41000000, v2 ; GFX11-GISEL-NEXT: v_add_f32_e32 v3, 0x41800000, v1 ; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 0x41000000, v1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v3, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll index c16fa2d40097d..9c900930c8ac0 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll @@ -286,15 +286,15 @@ define amdgpu_kernel void @fmul_v2f16( ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -305,9 +305,9 @@ define amdgpu_kernel void @fmul_v2f16( ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v2, v3, v2 +; SI-NEXT: v_mul_f32_e32 v2, v2, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; 
SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -320,20 +320,20 @@ define amdgpu_kernel void @fmul_v2f16( ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 -; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_mov_b32 s15, s7 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mul_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_mul_f16_e32 v0, v1, v0 +; VI-NEXT: v_mul_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_e32 v0, v0, v1 ; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -374,6 +374,7 @@ define amdgpu_kernel void @fmul_v2f16( ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s12, s2 ; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[4:7], 0 ; GFX11-NEXT: s_mov_b32 s8, s0 @@ -586,15 +587,15 @@ define amdgpu_kernel void @fmul_v4f16( ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s10 -; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_mov_b32 s13, s11 -; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: 
buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -611,10 +612,10 @@ define amdgpu_kernel void @fmul_v4f16( ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_mul_f32_e32 v5, v7, v5 -; SI-NEXT: v_mul_f32_e32 v4, v6, v4 -; SI-NEXT: v_mul_f32_e32 v1, v3, v1 -; SI-NEXT: v_mul_f32_e32 v0, v2, v0 +; SI-NEXT: v_mul_f32_e32 v5, v5, v7 +; SI-NEXT: v_mul_f32_e32 v4, v4, v6 +; SI-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-NEXT: v_mul_f32_e32 v0, v0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 @@ -632,22 +633,22 @@ define amdgpu_kernel void @fmul_v4f16( ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 -; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_mov_b32 s15, s7 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mul_f16_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_mul_f16_e32 v1, v3, v1 -; VI-NEXT: v_mul_f16_sdwa v3, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_mul_f16_e32 v0, v2, v0 +; 
VI-NEXT: v_mul_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_e32 v1, v1, v3 +; VI-NEXT: v_mul_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_e32 v0, v0, v2 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v3 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -659,20 +660,20 @@ define amdgpu_kernel void @fmul_v4f16( ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s2 ; GFX9-NEXT: s_mov_b32 s13, s3 -; GFX9-NEXT: s_mov_b32 s14, s6 ; GFX9-NEXT: s_mov_b32 s15, s7 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v1, v3, v1 -; GFX9-NEXT: v_pk_mul_f16 v0, v2, v0 +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -683,20 +684,21 @@ define amdgpu_kernel void @fmul_v4f16( ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s12, s2 ; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0 -; 
GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[4:7], 0 ; GFX11-NEXT: s_mov_b32 s8, s0 ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_mul_f16 v1, v3, v1 -; GFX11-NEXT: v_pk_mul_f16 v0, v2, v0 +; GFX11-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 125d009429cbf..8b28fe04d2205 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -337,21 +337,21 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s6, s10 -; CI-NEXT: s_mov_b32 s7, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; CI-NEXT: s_mov_b32 s8, s0 ; CI-NEXT: s_mov_b32 s9, s1 ; CI-NEXT: s_mov_b32 s0, s2 ; CI-NEXT: s_mov_b32 s1, s3 ; CI-NEXT: s_mov_b32 s2, s10 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: s_mov_b32 s7, s11 ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_rcp_f32_e32 v2, v1 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -522,21 +522,21 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s6, s10 -; CI-NEXT: s_mov_b32 s7, s11 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_ushort v1, 
off, s[4:7], 0 offset:8 ; CI-NEXT: s_mov_b32 s8, s0 ; CI-NEXT: s_mov_b32 s9, s1 ; CI-NEXT: s_mov_b32 s0, s2 ; CI-NEXT: s_mov_b32 s1, s3 ; CI-NEXT: s_mov_b32 s2, s10 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: s_mov_b32 s7, s11 ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 ; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_rcp_f32_e32 v2, v1 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -2596,16 +2596,16 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; VI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 ; VI-NEXT: v_rcp_f32_e32 v10, v9 ; VI-NEXT: v_mul_f32_e32 v11, v7, v10 diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll index a764681645c42..95baeb64ca0de 100644 --- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll @@ -286,15 +286,15 @@ define amdgpu_kernel void @fsub_v2f16( ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt 
lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -305,9 +305,9 @@ define amdgpu_kernel void @fsub_v2f16( ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_sub_f32_e32 v2, v3, v2 +; SI-NEXT: v_sub_f32_e32 v2, v2, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_sub_f32_e32 v0, v1, v0 +; SI-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -320,20 +320,20 @@ define amdgpu_kernel void @fsub_v2f16( ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 -; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_mov_b32 s15, s7 -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_sub_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_sub_f16_e32 v0, v1, v0 +; VI-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_sub_f16_e32 v0, v0, v1 ; VI-NEXT: 
v_or_b32_e32 v0, v0, v2 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -374,6 +374,7 @@ define amdgpu_kernel void @fsub_v2f16( ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s12, s2 ; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[4:7], 0 ; GFX11-NEXT: s_mov_b32 s8, s0 diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll index 0db2a1679197e..0795d0c36952d 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll @@ -543,25 +543,25 @@ define void @void_func_v32i32_inreg(<32 x i32> inreg %arg0) #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_mov_b32 v14, v0 -; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off ; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off -; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: v_dual_mov_b32 v12, s28 :: v_dual_mov_b32 v13, s29 -; GFX11-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25 -; GFX11-NEXT: v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s27 -; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21 -; GFX11-NEXT: v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23 -; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17 -; GFX11-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19 -; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 -; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 -; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 +; GFX11-NEXT: v_dual_mov_b32 v8, s26 :: v_dual_mov_b32 v9, s27 +; GFX11-NEXT: v_dual_mov_b32 v16, s20 :: v_dual_mov_b32 v17, s21 +; GFX11-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-NEXT: 
v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v23, s19 +; GFX11-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v27, s3 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off -; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off ; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off +; GFX11-NEXT: global_store_b128 v[0:1], v[20:23], off +; GFX11-NEXT: global_store_b128 v[0:1], v[24:27], off ; GFX11-NEXT: s_setpc_b64 s[30:31] store <32 x i32> %arg0, ptr addrspace(1) poison ret void @@ -779,25 +779,25 @@ define void @void_func_v16i64_inreg(<16 x i64> inreg %arg0) #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_mov_b32 v14, v0 -; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off ; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off -; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: v_dual_mov_b32 v12, s28 :: v_dual_mov_b32 v13, s29 -; GFX11-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25 -; GFX11-NEXT: v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s27 -; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21 -; GFX11-NEXT: v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23 -; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17 -; GFX11-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19 -; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 -; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 -; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 +; GFX11-NEXT: v_dual_mov_b32 v8, s26 :: 
v_dual_mov_b32 v9, s27 +; GFX11-NEXT: v_dual_mov_b32 v16, s20 :: v_dual_mov_b32 v17, s21 +; GFX11-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v23, s19 +; GFX11-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v27, s3 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off -; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off ; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off +; GFX11-NEXT: global_store_b128 v[0:1], v[20:23], off +; GFX11-NEXT: global_store_b128 v[0:1], v[24:27], off ; GFX11-NEXT: s_setpc_b64 s[30:31] store <16 x i64> %arg0, ptr addrspace(1) poison ret void @@ -1243,25 +1243,25 @@ define void @void_func_v16f64_inreg(<16 x double> inreg %arg0) #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_mov_b32 v14, v0 -; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off ; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off -; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: v_dual_mov_b32 v12, s28 :: v_dual_mov_b32 v13, s29 -; GFX11-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25 -; GFX11-NEXT: v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s27 -; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21 -; GFX11-NEXT: v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23 -; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17 -; GFX11-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19 -; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 -; GFX11-NEXT: 
v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 -; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 +; GFX11-NEXT: v_dual_mov_b32 v8, s26 :: v_dual_mov_b32 v9, s27 +; GFX11-NEXT: v_dual_mov_b32 v16, s20 :: v_dual_mov_b32 v17, s21 +; GFX11-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v23, s19 +; GFX11-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v27, s3 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off -; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off ; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off +; GFX11-NEXT: global_store_b128 v[0:1], v[20:23], off +; GFX11-NEXT: global_store_b128 v[0:1], v[24:27], off ; GFX11-NEXT: s_setpc_b64 s[30:31] store <16 x double> %arg0, ptr addrspace(1) poison ret void diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 81b8b36180746..bfa50b42881c0 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -2922,8 +2922,8 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:12 ; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:12 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:8 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 @@ -2944,7 +2944,7 @@ define void 
@void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3185,9 +3185,9 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x4 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 @@ -3208,7 +3208,7 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: buffer_store_b64 v[32:33], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3334,10 +3334,10 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20 ; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:8 +; GFX11-NEXT: 
scratch_load_b32 v33, off, s32 offset:20 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:16 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 @@ -3358,15 +3358,14 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: s_waitcnt vmcnt(4) ; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: buffer_store_b32 v35, off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: buffer_store_b32 v36, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b32 v35, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_store_b64 v[32:33], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3422,14 +3421,14 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x8 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v39, 
off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:20 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt vmcnt(8) @@ -3449,11 +3448,11 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc +; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) poison @@ -3505,13 +3504,13 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x8 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16 ; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 ; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:32 ; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:28 ; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16 ; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:20 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 @@ -3532,7 +3531,7 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3698,26 +3697,26 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x10 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:48 ; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:44 ; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:52 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:64 +; 
GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:48 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt vmcnt(11) +; GFX11-NEXT: s_waitcnt vmcnt(13) ; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc @@ -3734,15 +3733,14 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: buffer_store_b128 v[52:55], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: s_waitcnt vmcnt(5) ; GFX11-NEXT: buffer_store_b128 v[48:51], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b128 v[52:55], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3980,39 +3978,39 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v37, 
off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:124 -; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:84 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:116 +; GFX11-NEXT: 
scratch_load_b32 v54, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:80 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt vmcnt(15) @@ -4033,26 +4031,22 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: buffer_store_b128 v[84:87], off, s[0:3], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(6) ; GFX11-NEXT: buffer_store_b128 v[80:83], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(5) ; GFX11-NEXT: buffer_store_b128 v[68:71], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: s_waitcnt vmcnt(5) ; GFX11-NEXT: 
buffer_store_b128 v[64:67], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: buffer_store_b128 v[52:55], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: buffer_store_b128 v[48:51], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b128 v[84:87], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index ca9cb456fa19f..e960827aa3a06 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -15791,6 +15791,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s25, 21 ; GFX11-NEXT: s_mov_b32 s24, s40 ; GFX11-NEXT: s_mov_b32 s25, s41 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b64 off, v[4:5], s2 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: v_writelane_b32 v40, s26, 22 @@ -16232,6 +16233,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX11-NEXT: s_mov_b32 s25, s41 ; GFX11-NEXT: v_writelane_b32 v40, s26, 22 ; GFX11-NEXT: s_mov_b32 s26, s42 +; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_store_b32 off, v6, s2 ; GFX11-NEXT: scratch_store_b64 off, v[4:5], s3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 @@ -16960,6 +16962,7 @@ define amdgpu_gfx void @stack_8xv5i32() #0 { ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 16 
+; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 @@ -17248,6 +17251,7 @@ define amdgpu_gfx void @stack_8xv5f32() #0 { ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 16 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0 ; GFX11-NEXT: v_mov_b32_e32 v6, 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index 6384fdba7a45a..d14d306baffe6 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -2529,74 +2529,72 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:168 ; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:164 -; GFX11-NEXT: s_clause 0x11 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16 +; GFX11-NEXT: s_clause 0x13 ; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12 ; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:72 -; GFX11-NEXT: 
scratch_load_b32 v59, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:100 ; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:120 ; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124 -; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 
v60, off, s32 offset:132 ; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136 +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: scratch_load_b32 v16, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:84 ; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-NEXT: s_clause 0xd -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v16, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:4 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v11, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v10, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v9, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 ; 
GFX11-NEXT: s_waitcnt vmcnt(10) -; GFX11-NEXT: scratch_store_b128 v0, v[60:63], off offset:272 -; GFX11-NEXT: s_waitcnt vmcnt(9) -; GFX11-NEXT: scratch_store_b128 v0, v[12:15], off offset:256 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:272 +; GFX11-NEXT: scratch_store_b128 v0, v[60:63], off offset:256 ; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: scratch_store_b128 v0, v[16:19], off offset:240 -; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: scratch_store_b128 v0, v[20:23], off offset:224 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[20:23], off offset:240 +; GFX11-NEXT: scratch_store_b128 v0, v[56:59], off offset:224 ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: scratch_store_b128 v0, v[56:59], off offset:208 -; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: scratch_store_b128 v0, v[41:44], off offset:192 -; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: scratch_store_b128 v0, v[37:40], off offset:176 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_store_b128 v0, v[41:44], off offset:208 +; GFX11-NEXT: scratch_store_b128 v0, v[37:40], off offset:192 +; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:176 +; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:160 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:144 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3211,25 +3209,30 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_store_b32 off, v59, s33 ; GFX11-NEXT: s_add_i32 s0, s32, 0xa0 ; GFX11-NEXT: s_add_i32 s1, s32, 0x90 +; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: scratch_store_b32 off, v4, s0 ; GFX11-NEXT: scratch_store_b128 off, 
v[0:3], s1 ; GFX11-NEXT: s_add_i32 s0, s32, 0x80 ; GFX11-NEXT: s_add_i32 s1, s32, 0x70 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 ; GFX11-NEXT: s_add_i32 s0, s32, 0x60 ; GFX11-NEXT: s_add_i32 s1, s32, 0x50 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 ; GFX11-NEXT: s_add_i32 s0, s32, 64 ; GFX11-NEXT: s_add_i32 s1, s32, 48 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 ; GFX11-NEXT: s_add_i32 s0, s32, 32 ; GFX11-NEXT: s_add_i32 s1, s32, 16 ; GFX11-NEXT: s_add_i32 s2, s33, 0x200 ; GFX11-NEXT: v_writelane_b32 v60, s30, 0 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, 0 @@ -3288,7 +3291,8 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1552 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:608 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1536 ; 16-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1536 ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s32 ; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v32, v36 ; GFX11-NEXT: v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 82c58394c03bb..7c3191a7e1f20 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -743,15 +743,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; 
GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1961,15 +1961,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 
s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3239,15 +3239,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -4013,15 +4013,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 
s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -5316,15 +5316,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -6096,12 +6096,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: s_load_b64 
s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 @@ -6111,7 +6108,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_mov_b32 s12, s51 ; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -6166,10 +6167,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] @@ -6178,7 +6177,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_mov_b32 s12, s51 ; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; 
GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6555,12 +6557,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -6570,7 +6569,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6625,10 +6628,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 
0 :: v_dual_mov_b32 v2, s52 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] @@ -6637,7 +6638,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7149,12 +7153,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 @@ -7164,7 +7165,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b32 s12, s51 ; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; 
GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -7245,10 +7250,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] @@ -7257,7 +7260,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s12, s51 ; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7288,7 +7294,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -7296,6 +7301,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; 
GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 @@ -7830,12 +7836,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -7845,7 +7848,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -7945,10 +7952,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; 
GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] @@ -7957,7 +7962,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8922,15 +8930,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 
0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -10355,15 +10363,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -11270,15 +11278,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], 
s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -12130,12 +12138,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 @@ -12145,7 +12150,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_mov_b32 s12, s51 ; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -12206,10 +12215,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 
-; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] @@ -12218,7 +12225,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_mov_b32 s12, s51 ; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -12605,12 +12615,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -12620,7 +12627,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; 
GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -12681,10 +12692,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] @@ -12693,7 +12702,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -13205,12 +13217,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: 
scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 @@ -13220,7 +13229,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b32 s12, s51 ; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -13301,10 +13314,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] @@ -13313,7 +13324,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s12, s51 ; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; 
GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -13344,7 +13358,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -13352,6 +13365,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 @@ -13886,12 +13900,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -13901,7 +13912,11 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -14001,10 +14016,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] @@ -14013,7 +14026,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll 
b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index f8f911b693e09..2a640dda13ff3 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -645,15 +645,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1684,15 +1684,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, 
div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2723,15 +2723,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3490,10 +3490,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; 
GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s52 -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -3555,9 +3556,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -3930,10 +3932,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3995,9 +3998,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -4541,12 +4545,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off @@ -4639,13 +4644,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s12, s51 ; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v2, s52 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off -; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 ; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: v_mov_b32_e32 v5, 8 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off @@ -4675,7 +4682,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; 
GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -4683,6 +4689,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 @@ -5267,10 +5274,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5388,9 +5396,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6170,15 +6179,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -7036,10 +7045,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s52 -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -7101,9 +7111,10 @@ define 
amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7476,10 +7487,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -7541,9 +7553,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8087,12 +8100,13 @@ define 
amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off @@ -8185,13 +8199,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s12, s51 ; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v2, s52 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off -; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 ; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: v_mov_b32_e32 v5, 8 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off @@ -8221,7 +8237,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -8229,6 +8244,7 @@ define 
amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 @@ -8813,10 +8829,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -8934,9 +8951,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 1f76a476107a3..dab2066706b5a 100644 
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -645,15 +645,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1684,15 +1684,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: 
s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2723,15 +2723,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3490,10 +3490,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: 
v_mov_b32_e32 v2, s52 -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -3555,9 +3556,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -3930,10 +3932,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3995,9 +3998,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -4541,12 +4545,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off @@ -4639,13 +4644,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s12, s51 ; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v2, s52 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off -; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 ; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: v_mov_b32_e32 v5, 8 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off @@ -4675,7 +4682,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 
s[48:49], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -4683,6 +4689,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 @@ -5267,10 +5274,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5388,9 +5396,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6170,15 +6179,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -7036,10 +7045,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: v_mov_b32_e32 v2, s52 -; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -7101,9 +7111,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_mov_b32 s14, s33 ; GFX1132-NEXT: 
s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7476,10 +7487,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -7541,9 +7553,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8087,12 +8100,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: 
s_mov_b32 s14, s33 ; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off @@ -8185,13 +8199,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s12, s51 ; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v2, s52 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off -; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 ; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: v_mov_b32_e32 v5, 8 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off @@ -8221,7 +8237,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -8229,6 +8244,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; 
GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 @@ -8813,10 +8829,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -8934,9 +8951,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 9db3c37045ccf..f9cab76900385 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ 
b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -829,15 +829,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2159,15 +2159,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, 
div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3489,15 +3489,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -4315,15 +4315,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: 
s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -5644,15 +5644,15 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -6424,12 +6424,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 @@ -6439,7 +6436,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_mov_b32 s12, s51 ; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -6494,10 +6495,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] @@ -6506,7 +6505,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_mov_b32 s12, s51 ; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: 
s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6883,12 +6885,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -6898,7 +6897,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6953,10 +6956,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: 
scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] @@ -6965,7 +6966,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7477,12 +7481,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 @@ -7492,7 +7493,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b32 s12, s51 ; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: 
scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -7573,10 +7578,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] @@ -7585,7 +7588,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s12, s51 ; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7616,7 +7622,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -7624,6 
+7629,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 @@ -8158,12 +8164,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -8173,7 +8176,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -8273,10 +8280,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 
s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] @@ -8285,7 +8290,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -9249,15 +9257,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.double.value@gotpcrel32@lo+4 
+; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -10682,15 +10690,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -11597,15 +11605,15 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s8 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s4, 44 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s5, 0 -; GFX7LESS-DPP-NEXT: s_getpc_b64 s[4:5] -; 
GFX7LESS-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 -; GFX7LESS-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX7LESS-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 @@ -12456,12 +12464,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 @@ -12471,7 +12476,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_mov_b32 s12, s51 ; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -12532,10 +12541,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau 
; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] @@ -12544,7 +12551,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_mov_b32 s12, s51 ; GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -12931,12 +12941,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -12946,7 +12953,11 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -13007,10 +13018,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] @@ -13019,7 +13028,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -13531,12 +13543,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; 
GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 @@ -13546,7 +13555,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b32 s12, s51 ; GFX1164-NEXT: s_mov_b32 s13, s50 ; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -13627,10 +13640,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[48:49] @@ -13639,7 +13650,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s12, s51 ; 
GFX1132-NEXT: s_mov_b32 s13, s50 ; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -13670,7 +13684,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[48:49], s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s55, 0xf000 ; GFX7LESS-DPP-NEXT: s_mov_b32 s54, -1 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 @@ -13678,6 +13691,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[52:53], s[4:5], 0x9 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 @@ -14212,12 +14226,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 @@ -14227,7 +14238,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s52 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s53 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -14327,10 +14342,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[48:49] @@ -14339,7 +14352,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s12, s51 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s50 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s52 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s53 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll index 80d4fa69be425..8eae0db8a577a 100644 --- a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll +++ b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll @@ -9,10 +9,11 @@ define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 in ; GFX11-NEXT: s_mov_b32 m0, s4 ; GFX11-NEXT: s_getpc_b64 s[4:5] ; GFX11-NEXT: s_mov_b32 s0, s1 -; GFX11-NEXT: s_mov_b32 s6, s3 ; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: s_mov_b32 s6, s3 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s7, s5 +; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x0 ; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x0 ; GFX11-NEXT: s_load_b256 s[0:7], s[6:7], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 56ceba258f471..da0db4d1cd1fb 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -22,36 +22,35 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v5, s52, 10 ; CHECK-NEXT: v_writelane_b32 v5, s53, 11 ; CHECK-NEXT: v_writelane_b32 v5, s54, 12 -; CHECK-NEXT: v_writelane_b32 v5, s55, 13 ; CHECK-NEXT: s_getpc_b64 s[24:25] -; CHECK-NEXT: v_writelane_b32 v5, s64, 14 +; CHECK-NEXT: v_writelane_b32 v5, s55, 13 ; CHECK-NEXT: s_movk_i32 s4, 0xf0 ; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: v_writelane_b32 v5, s65, 15 +; CHECK-NEXT: v_writelane_b32 v5, s64, 14 ; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: v_writelane_b32 v5, s66, 16 +; CHECK-NEXT: v_writelane_b32 v5, s65, 15 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v5, s67, 17 +; CHECK-NEXT: v_writelane_b32 v5, s66, 16 ; CHECK-NEXT: s_waitcnt 
lgkmcnt(0) ; CHECK-NEXT: s_movk_i32 s6, 0x130 ; CHECK-NEXT: s_mov_b32 s7, s24 -; CHECK-NEXT: v_writelane_b32 v5, s68, 18 +; CHECK-NEXT: v_writelane_b32 v5, s67, 17 ; CHECK-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0 +; CHECK-NEXT: v_writelane_b32 v5, s68, 18 ; CHECK-NEXT: v_writelane_b32 v5, s69, 19 ; CHECK-NEXT: v_writelane_b32 v5, s70, 20 ; CHECK-NEXT: s_mov_b32 s68, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: v_writelane_b32 v5, s71, 21 -; CHECK-NEXT: v_mov_b32_e32 v2, s4 -; CHECK-NEXT: v_mov_b32_e32 v3, v1 +; CHECK-NEXT: v_mov_b32_e32 v3, s4 +; CHECK-NEXT: v_mov_b32_e32 v4, v1 ; CHECK-NEXT: s_mov_b32 s69, s68 ; CHECK-NEXT: s_mov_b32 s70, s68 ; CHECK-NEXT: s_mov_b32 s71, s68 -; CHECK-NEXT: image_sample_lz v3, v[2:3], s[16:23], s[68:71] dmask:0x1 ; CHECK-NEXT: v_mov_b32_e32 v2, v1 ; CHECK-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane -; CHECK-NEXT: s_mov_b32 s6, 48 +; CHECK-NEXT: image_sample_lz v3, v[3:4], s[16:23], s[68:71] dmask:0x1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_writelane_b32 v6, s36, 0 ; CHECK-NEXT: v_writelane_b32 v6, s37, 1 @@ -69,6 +68,7 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v6, s48, 12 ; CHECK-NEXT: v_writelane_b32 v6, s49, 13 ; CHECK-NEXT: v_writelane_b32 v6, s50, 14 +; CHECK-NEXT: s_mov_b32 s6, 48 ; CHECK-NEXT: s_movk_i32 s56, 0x1f0 ; CHECK-NEXT: s_movk_i32 s72, 0x2f0 ; CHECK-NEXT: s_mov_b32 s57, s24 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll index b443e654350c5..88276e46f355a 100644 --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -47,21 +47,21 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: 
v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0 -; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -187,18 +187,17 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_u32_u24_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX8-NEXT: v_mul_u32_u24_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; 
GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 @@ -327,21 +326,21 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, v1, v0, s0 +; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -463,21 +462,21 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: 
flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0 +; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -605,21 +604,21 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0 -; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; 
GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -745,21 +744,21 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 +; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, v1, v0, s0 +; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -885,19 +884,19 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt 
vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v0, s0 -; GFX8-NEXT: v_mad_u32_u24 v2, v1, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v1, v1, v1, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1170,20 +1169,20 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s0 -; GFX8-NEXT: v_mad_u32_u24 v2, v3, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1591,21 +1590,21 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, 
vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0 -; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1734,21 +1733,21 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX8-NEXT: 
s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0 -; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0 +; GFX8-NEXT: v_mad_u32_u24 v1, v3, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1887,21 +1886,21 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v1, v2, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, v1, v0, s0 +; GFX8-NEXT: v_mad_i32_i24 v1, v3, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2040,22 +2039,22 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; 
GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s0 -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4 -; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v4, v3, v2, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, v4 +; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2199,22 +2198,22 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 
+; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v4, v2, v1, s0 -; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4 -; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v4, v3, v2, s0 +; GFX8-NEXT: v_mad_i32_i24 v0, v1, v0, v4 +; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2358,22 +2357,22 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s0 -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4 -; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v4, v1, v0, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, v4 +; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2515,22 +2514,22 @@ define 
amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v4, v0, v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4 -; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v4, v1, v0, s0 +; GFX8-NEXT: v_mad_i32_i24 v0, v1, v0, v4 +; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2645,19 +2644,18 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v3, off, s[4:7], 0 ; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; 
GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, v3 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -2671,22 +2669,22 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: flat_load_ushort v4, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: flat_load_ushort v2, v[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v4, v5, v6, v4 -; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 -; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: v_mad_u16 v2, v3, v6, v2 +; GFX8-NEXT: v_mad_u16 v0, v0, v1, v2 +; GFX8-NEXT: flat_store_short v[4:5], v0 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_acc16: @@ -2737,7 +2735,7 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_clause 0x2 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] ; GFX10-DL-NEXT: global_load_ushort v4, v1, s[6:7] @@ -2812,23 +2810,23 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: flat_load_ushort v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v3 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 8 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, v1, v0, s0 +; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index d28f0a190e117..9f792c84919b2 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -52,27 +52,27 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 -; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 -; GFX8-NEXT: v_bfe_i32 v6, v3, 16, 8 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 24, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 -; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 8 +; GFX8-NEXT: v_bfe_i32 v4, v0, 8, 8 +; GFX8-NEXT: v_bfe_i32 v5, v1, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0 -; GFX8-NEXT: v_bfe_i32 v7, v0, 16, 8 -; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 +; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, s0 +; GFX8-NEXT: v_bfe_i32 v6, v0, 16, 8 +; GFX8-NEXT: v_bfe_i32 v7, v1, 16, 8 +; GFX8-NEXT: v_mad_i32_i24 v2, v4, v5, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1 -; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 24, v1 +; GFX8-NEXT: v_mad_i32_i24 v2, v6, v7, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -205,25 +205,24 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 
s[4:5], s[10:11] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 +; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_i32 v6, v0, 0, 8 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_bfe_i32 v7, v0, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_bfe_i32 v8, v0, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v6, v3 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 @@ -245,35 +244,35 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: flat_load_ushort v4, v[0:1] +; GFX8-NEXT: flat_load_ushort v3, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_bfe_i32 v7, v3, 0, 8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 8 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; 
GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX8-NEXT: v_bfe_i32 v7, v4, 0, 8 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v8, v2, 0, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 8 ; GFX8-NEXT: v_bfe_i32 v10, v10, 0, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v4, v7, v8, v4 +; GFX8-NEXT: v_mad_u16 v3, v7, v8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 8 -; GFX8-NEXT: v_mad_u16 v4, v9, v10, v4 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX8-NEXT: v_mad_u16 v3, v9, v10, v3 +; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX8-NEXT: v_mad_u16 v4, v5, v6, v4 -; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u16 v3, v5, v6, v3 +; GFX8-NEXT: v_mad_u16 v2, v4, v2, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -318,11 +317,11 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1] ; GFX9-DL-NEXT: global_load_sshort v4, v1, s[6:7] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v0, v2, v3, v4 +; GFX9-DL-NEXT: v_dot4_i32_i8 v0, v3, v2, v4 ; GFX9-DL-NEXT: global_store_short v1, v0, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -334,12 +333,12 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, 
s[0:1] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX10-DL-NEXT: s_clause 0x2 +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: global_load_sshort v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_dot4c_i32_i8 v4, v2, v3 +; GFX10-DL-NEXT: v_dot4c_i32_i8 v4, v3, v2 ; GFX10-DL-NEXT: global_store_short v1, v4, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; @@ -352,12 +351,12 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: s_clause 0x2 +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[2:3] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-DL-NEXT: global_load_i16 v3, v1, s[4:5] ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) -; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v2, v0, v3 neg_lo:[1,1,0] +; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v2, v3 neg_lo:[1,1,0] ; GFX11-DL-NEXT: global_store_b16 v1, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, @@ -405,36 +404,35 @@ entry: define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s7 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], 
s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 -; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v6, v3 ; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_acc8: @@ -446,25 +444,25 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: flat_load_ubyte v4, v[0:1] +; GFX8-NEXT: flat_load_ubyte v3, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u16 v2, v4, v2, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_mad_u16 v2, v7, v8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v4 ; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2 ; GFX8-NEXT: v_mad_u16 v2, v9, v10, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -503,11 +501,11 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1] ; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 +; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 ; GFX9-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -519,12 +517,12 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX10-DL-NEXT: s_clause 0x2 +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; 
GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; @@ -537,12 +535,12 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: s_clause 0x2 +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[2:3] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-DL-NEXT: global_load_u8 v3, v1, s[4:5] ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) -; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3 +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3 ; GFX11-DL-NEXT: global_store_b8 v1, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, @@ -624,28 +622,28 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 -; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 -; GFX8-NEXT: v_bfe_i32 v6, v3, 16, 8 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 24, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 -; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 8 +; GFX8-NEXT: v_bfe_i32 v4, v0, 8, 8 +; 
GFX8-NEXT: v_bfe_i32 v5, v1, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v8, v1, v2, s0 +; GFX8-NEXT: v_mad_i32_i24 v8, v2, v3, s0 ; GFX8-NEXT: v_mad_i32_i24 v4, v4, v5, v8 -; GFX8-NEXT: v_bfe_i32 v7, v0, 16, 8 -; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, v4 +; GFX8-NEXT: v_bfe_i32 v6, v0, 16, 8 +; GFX8-NEXT: v_bfe_i32 v7, v1, 16, 8 +; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, v4 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1 -; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 24, v1 +; GFX8-NEXT: v_mad_i32_i24 v2, v6, v7, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -830,27 +828,27 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 24, v3 -; GFX8-NEXT: v_bfe_i32 v5, v3, 16, 8 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b16_e32 v2, 8, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v6, 24, v0 -; GFX8-NEXT: v_bfe_i32 v7, v0, 16, 8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 24, v0 +; GFX8-NEXT: v_bfe_i32 v5, v0, 16, 8 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX8-NEXT: 
v_ashrrev_i32_e32 v6, 24, v1 +; GFX8-NEXT: v_bfe_i32 v7, v1, 16, 8 +; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s0 -; GFX8-NEXT: v_mad_i32_i24 v0, v1, v2, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, v0, v1, s0 +; GFX8-NEXT: v_mad_i32_i24 v0, v2, v3, v0 ; GFX8-NEXT: v_mad_i32_i24 v0, v5, v7, v0 ; GFX8-NEXT: v_mad_i32_i24 v2, v4, v6, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 @@ -974,17 +972,17 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_i32 v4, v2, 0, 8 -; GFX7-NEXT: v_bfe_i32 v3, v2, 16, 8 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v7, v0, 0, 8 +; GFX7-NEXT: v_bfe_i32 v1, v2, 16, 8 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 24, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_i32 v7, v0, 0, 8 ; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_bfe_i32 v6, v0, 16, 8 @@ -993,14 +991,13 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_mad_u32_u24 v3, v4, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: 
v_mad_u32_u24 v0, v2, v0, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v1, v6, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v8, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1014,26 +1011,26 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: flat_load_ushort v4, v[0:1] +; GFX8-NEXT: flat_load_ushort v3, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 8, v3 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX8-NEXT: v_ashrrev_i16_e32 v9, 8, v5 -; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_ashrrev_i16_e32 v7, 8, v4 +; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 8, v2 ; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u16 v2, v4, v2, v3 +; GFX8-NEXT: v_ashrrev_i16_e32 v9, 8, v5 +; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 8, v6 ; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 8 ; GFX8-NEXT: v_mad_u16 v2, v7, v8, v2 @@ -1164,7 +1161,7 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr 
addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-TRUE16-NEXT: s_clause 0x1 +; GFX11-DL-TRUE16-NEXT: s_clause 0x2 ; GFX11-DL-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[0:1] ; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v3, s[4:5] @@ -1209,7 +1206,7 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-FAKE16-NEXT: s_clause 0x1 +; GFX11-DL-FAKE16-NEXT: s_clause 0x2 ; GFX11-DL-FAKE16-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-DL-FAKE16-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-DL-FAKE16-NEXT: global_load_u16 v3, v2, s[4:5] @@ -1312,21 +1309,21 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 -; GFX8-NEXT: v_bfe_i32 v3, v3, 8, 8 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 8 ; GFX8-NEXT: v_bfe_i32 v0, v0, 8, 8 +; GFX8-NEXT: v_bfe_i32 v1, v1, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0 -; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 +; GFX8-NEXT: 
v_mad_i32_i24 v2, v2, v3, s0 +; GFX8-NEXT: v_mad_i32_i24 v2, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1485,24 +1482,24 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 -; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 -; GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 -; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 8 +; GFX8-NEXT: v_bfe_i32 v4, v0, 8, 8 +; GFX8-NEXT: v_bfe_i32 v5, v1, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0 +; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, s0 ; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8 -; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 -; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 +; GFX8-NEXT: v_bfe_i32 v1, v1, 16, 8 +; GFX8-NEXT: v_mad_i32_i24 v2, v4, v5, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1672,24 +1669,24 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 24, v3 -; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 8 -; GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 24, v0 -; GFX8-NEXT: v_bfe_i32 v5, v0, 0, 8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 24, v1 +; GFX8-NEXT: v_bfe_i32 v4, v0, 0, 8 +; GFX8-NEXT: v_bfe_i32 v5, v1, 0, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0 +; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, s0 ; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8 -; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 -; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 +; GFX8-NEXT: v_bfe_i32 v1, v1, 16, 8 +; GFX8-NEXT: v_mad_i32_i24 v2, v4, v5, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1859,25 +1856,25 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 
8 -; GFX8-NEXT: v_bfe_i32 v7, v3, 16, 8 +; GFX8-NEXT: v_bfe_i32 v3, v4, 0, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v5, v2, 0, 8 -; GFX8-NEXT: v_mul_i32_i24_sdwa v6, sext(v3), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX8-NEXT: v_mul_i32_i24_sdwa v6, sext(v4), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX8-NEXT: v_bfe_i32 v7, v4, 16, 8 ; GFX8-NEXT: v_bfe_i32 v8, v2, 16, 8 -; GFX8-NEXT: v_mad_i32_i24 v4, v4, v5, v6 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 24, v3 +; GFX8-NEXT: v_mad_i32_i24 v3, v3, v5, v6 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 24, v4 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 24, v2 -; GFX8-NEXT: v_mad_i32_i24 v4, v7, v8, v4 -; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_i32_i24 v3, v7, v8, v3 +; GFX8-NEXT: v_mad_i32_i24 v2, v4, v2, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2030,36 +2027,36 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; GFX8-LABEL: idot4_acc32_3src: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; 
GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 -; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8 +; GFX8-NEXT: v_bfe_i32 v1, v5, 0, 8 +; GFX8-NEXT: v_bfe_i32 v3, v5, 8, 8 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_bfe_i32 v2, v2, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_i32_i24 v1, v1, v1, s0 -; GFX8-NEXT: v_bfe_i32 v5, v3, 16, 8 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 24, v3 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v4, v4, 8, 8 -; GFX8-NEXT: v_mad_i32_i24 v1, v2, v4, v1 +; GFX8-NEXT: v_bfe_i32 v4, v5, 16, 8 +; GFX8-NEXT: v_mad_i32_i24 v1, v3, v2, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v5, 24, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v6, v0, 16, 8 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX8-NEXT: v_mad_i32_i24 v1, v5, v6, v1 -; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 +; GFX8-NEXT: v_mad_i32_i24 v1, v4, v6, v1 +; GFX8-NEXT: v_mad_i32_i24 v2, v5, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2236,33 +2233,33 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX8-LABEL: idot4_acc32_3src_3ele: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; 
GFX8-NEXT: flat_load_dword v2, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 -; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8 +; GFX8-NEXT: v_bfe_i32 v1, v5, 0, 8 +; GFX8-NEXT: v_bfe_i32 v3, v5, 8, 8 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_bfe_i32 v2, v2, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_i32_i24 v1, v1, v1, s0 -; GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v4, v4, 8, 8 -; GFX8-NEXT: v_mad_i32_i24 v1, v2, v4, v1 +; GFX8-NEXT: v_bfe_i32 v4, v5, 16, 8 +; GFX8-NEXT: v_mad_i32_i24 v1, v3, v2, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8 -; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 +; GFX8-NEXT: v_mad_i32_i24 v2, v4, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2655,24 +2652,24 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 -; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 -; GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 -; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8 
+; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 8 +; GFX8-NEXT: v_bfe_i32 v4, v0, 8, 8 +; GFX8-NEXT: v_bfe_i32 v5, v1, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0 +; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, s0 +; GFX8-NEXT: v_bfe_i32 v1, v1, 16, 8 ; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8 -; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 -; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 +; GFX8-NEXT: v_mad_i32_i24 v2, v4, v5, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2842,32 +2839,32 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX8-LABEL: idot4_acc32_3src_3ele_src0: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8 +; GFX8-NEXT: v_bfe_i32 v3, v5, 8, 8 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v1, v4, 8, 8 -; GFX8-NEXT: v_bfe_i32 
v3, v4, 16, 8 +; GFX8-NEXT: v_bfe_i32 v1, v2, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_i32_i24 v4, v1, v1, s0 -; GFX8-NEXT: v_mad_i32_i24 v1, v2, v1, v4 +; GFX8-NEXT: v_bfe_i32 v2, v2, 16, 8 +; GFX8-NEXT: v_mad_i32_i24 v1, v3, v1, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8 -; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 +; GFX8-NEXT: v_mad_i32_i24 v2, v2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -3049,42 +3046,42 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; GFX8-LABEL: idot4_4src: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s10, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v6 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s13 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s12, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s10, v6 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, s13 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s12, v6 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] +; GFX8-NEXT: flat_load_dword v3, v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, s15 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v6 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: 
flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 -; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2 +; GFX8-NEXT: v_bfe_i32 v1, v7, 0, 8 +; GFX8-NEXT: v_bfe_i32 v4, v7, 8, 8 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_bfe_i32 v3, v4, 0, 8 -; GFX8-NEXT: v_bfe_i32 v4, v4, 8, 8 -; GFX8-NEXT: v_mad_i32_i24 v1, v3, v4, v1 +; GFX8-NEXT: v_bfe_i32 v5, v2, 0, 8 +; GFX8-NEXT: v_bfe_i32 v2, v2, 8, 8 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mad_i32_i24 v1, v1, v4, s2 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v6, v5, 0, 8 -; GFX8-NEXT: v_bfe_i32 v5, v5, 8, 8 -; GFX8-NEXT: v_mad_i32_i24 v1, v6, v5, v1 +; GFX8-NEXT: v_bfe_i32 v6, v3, 0, 8 +; GFX8-NEXT: v_bfe_i32 v3, v3, 8, 8 +; GFX8-NEXT: v_mad_i32_i24 v1, v5, v2, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v7, v0, 0, 8 ; GFX8-NEXT: v_bfe_i32 v0, v0, 8, 8 +; GFX8-NEXT: v_mad_i32_i24 v1, v6, v3, v1 ; GFX8-NEXT: v_mad_i32_i24 v2, v7, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -3297,34 +3294,34 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v2, 
v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, 0xff ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 8 -; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX8-NEXT: v_mul_lo_u16_sdwa v6, sext(v3), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_mul_lo_u16_sdwa v6, sext(v4), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 8 ; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_and_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 +; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX8-NEXT: v_mad_u16 v6, v8, v7, v6 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX8-NEXT: v_mad_u16 v4, v4, v5, v6 +; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8 +; GFX8-NEXT: v_mad_u16 v3, v3, v5, v6 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u16 v2, v4, v2, v3 ; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 82d62910bcb00..3ebee71515c68 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -52,27 +52,27 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: 
v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1 +; GFX8-NEXT: v_bfe_u32 v4, v0, 8, 8 +; GFX8-NEXT: v_bfe_u32 v5, v1, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 -; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, s0 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8 +; GFX8-NEXT: v_bfe_u32 v7, v1, 16, 8 +; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1 -; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -195,36 +195,35 @@ entry: define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s7 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 
s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: buffer_load_ushort v1, off, s[4:7], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 -; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v6, v3 ; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_acc16: @@ -237,31 +236,31 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: 
flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: flat_load_ushort v4, v[0:1] +; GFX8-NEXT: flat_load_ushort v3, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 -; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX8-NEXT: v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v7, 0xff, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX8-NEXT: v_and_b32_e32 v9, 0xff, v9 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v4, v6, v7, v4 +; GFX8-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX8-NEXT: v_and_b32_sdwa v10, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mad_u16 v4, v8, v9, v4 +; GFX8-NEXT: v_mad_u16 v3, v8, v9, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_mad_u16 v4, v10, v5, v4 -; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u16 v3, v10, v5, v3 +; GFX8-NEXT: v_mad_u16 v2, v4, v2, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -303,11 +302,11 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] +; 
GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1] ; GFX9-DL-NEXT: global_load_ushort v4, v1, s[6:7] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 +; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 ; GFX9-DL-NEXT: global_store_short v1, v0, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -319,12 +318,12 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX10-DL-NEXT: s_clause 0x2 +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: global_load_ushort v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 ; GFX10-DL-NEXT: global_store_short v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; @@ -337,12 +336,12 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: s_clause 0x2 +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[2:3] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-DL-NEXT: global_load_u16 v3, v1, s[4:5] ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) -; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3 +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3 ; GFX11-DL-NEXT: global_store_b16 v1, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, @@ -391,36 +390,35 @@ entry: define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s7 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 -; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v6, v3 ; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_acc8: @@ -432,25 +430,25 @@ define amdgpu_kernel void 
@udot4_acc8(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: flat_load_ubyte v4, v[0:1] +; GFX8-NEXT: flat_load_ubyte v3, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u16 v2, v4, v2, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_mad_u16 v2, v7, v8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v4 ; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2 ; GFX8-NEXT: v_mad_u16 v2, v9, v10, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -489,11 +487,11 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1] ; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 +; GFX9-DL-NEXT: 
v_dot4_u32_u8 v0, v3, v2, v4 ; GFX9-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -505,12 +503,12 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX10-DL-NEXT: s_clause 0x2 +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; @@ -523,12 +521,12 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: s_clause 0x2 +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[2:3] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-DL-NEXT: global_load_u8 v3, v1, s[4:5] ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) -; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3 +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3 ; GFX11-DL-NEXT: global_store_b8 v1, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, @@ -578,19 +576,18 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 ; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], 
s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, v3 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -604,22 +601,22 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: flat_load_ubyte v4, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: flat_load_ubyte v2, v[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 -; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2 -; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: v_mad_u16 v0, v0, v1, v2 +; GFX8-NEXT: v_mad_u16 v0, v3, v6, v0 +; GFX8-NEXT: flat_store_byte v[4:5], v0 ; GFX8-NEXT: s_endpgm ; 
; GFX9-NODL-LABEL: udot2_8: @@ -647,19 +644,19 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0100 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 +; GFX9-DL-NEXT: v_perm_b32 v0, v2, v2, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s0 +; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] +; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v4 +; GFX9-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_8: @@ -668,19 +665,19 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: s_clause 0x2 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100 +; GFX10-DL-NEXT: v_perm_b32 v0, v2, v2, 0xc0c0100 ; GFX10-DL-NEXT: 
s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0100 +; GFX10-DL-NEXT: v_perm_b32 v2, v3, v3, 0xc0c0100 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX10-DL-NEXT: global_store_byte v0, v1, s[6:7] +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v4 +; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot2_8: @@ -688,23 +685,22 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] -; GFX11-DL-NEXT: global_load_u8 v3, v2, s[4:5] +; GFX11-DL-NEXT: s_clause 0x2 +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[2:3] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-DL-NEXT: global_load_u8 v3, v1, s[4:5] ; GFX11-DL-NEXT: s_waitcnt vmcnt(2) -; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100 +; GFX11-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0100 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0100 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v3 -; GFX11-DL-NEXT: global_store_b8 v2, v0, s[4:5] +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3 +; GFX11-DL-NEXT: global_store_b8 v1, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -733,36 +729,35 @@ entry: define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) 
%src1, ; GFX7-LABEL: udot4_CommutationInsideMAD: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s7 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1 -; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v1, v3 ; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v5, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 -; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm 
; ; GFX8-LABEL: udot4_CommutationInsideMAD: @@ -774,25 +769,25 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: flat_load_ubyte v4, v[0:1] +; GFX8-NEXT: flat_load_ubyte v3, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v2, v2, v3, v4 +; GFX8-NEXT: v_mad_u16 v2, v2, v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_mad_u16 v2, v8, v7, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v4 ; GFX8-NEXT: v_mad_u16 v2, v6, v5, v2 ; GFX8-NEXT: v_mad_u16 v2, v10, v9, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -831,11 +826,11 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1] ; GFX9-DL-NEXT: 
global_load_ubyte v4, v1, s[6:7] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 +; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 ; GFX9-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -847,12 +842,12 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX10-DL-NEXT: s_clause 0x2 +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; @@ -865,12 +860,12 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: s_clause 0x2 +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[2:3] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-DL-NEXT: global_load_u8 v3, v1, s[4:5] ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) -; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3 +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3 ; GFX11-DL-NEXT: global_store_b8 v1, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, @@ -911,36 +906,35 @@ entry: define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_CommutationAccrossMADs: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s7 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 -; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1 -; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX7-NEXT: v_mad_u32_u24 v3, v7, v4, v3 ; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 -; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v1, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v5, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 -; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_CommutationAccrossMADs: @@ -952,25 +946,25 @@ define amdgpu_kernel void 
@udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: flat_load_ubyte v4, v[0:1] +; GFX8-NEXT: flat_load_ubyte v3, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v4, v8, v7, v4 -; GFX8-NEXT: v_mad_u16 v2, v2, v3, v4 +; GFX8-NEXT: v_mad_u16 v3, v8, v7, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX8-NEXT: v_mad_u16 v2, v2, v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v4 ; GFX8-NEXT: v_mad_u16 v2, v6, v5, v2 ; GFX8-NEXT: v_mad_u16 v2, v10, v9, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -1009,11 +1003,11 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1] ; GFX9-DL-NEXT: global_load_ubyte 
v4, v1, s[6:7] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 +; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 ; GFX9-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -1025,12 +1019,12 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] +; GFX10-DL-NEXT: s_clause 0x2 +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; @@ -1043,12 +1037,12 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] -; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-DL-NEXT: s_clause 0x2 +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[2:3] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-DL-NEXT: global_load_u8 v3, v1, s[4:5] ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) -; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3 +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3 ; GFX11-DL-NEXT: global_store_b8 v1, v0, s[4:5] ; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, @@ -1131,28 +1125,28 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; 
GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1 +; GFX8-NEXT: v_bfe_u32 v4, v0, 8, 8 +; GFX8-NEXT: v_bfe_u32 v5, v1, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v8, v1, v2, s0 +; GFX8-NEXT: v_mad_u32_u24 v8, v2, v3, s0 ; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v8 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8 +; GFX8-NEXT: v_bfe_u32 v7, v1, 16, 8 +; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1 -; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1339,28 +1333,28 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: 
v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_bfe_u32 v4, v0, 8, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX8-NEXT: v_bfe_u32 v5, v1, 8, 8 ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s0 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8 +; GFX8-NEXT: v_bfe_u32 v7, v1, 16, 8 +; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v6, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v4 -; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1514,24 +1508,23 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 +; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 -; GFX7-NEXT: 
s_waitcnt vmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_i32 v6, v0, 0, 8 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 +; GFX7-NEXT: v_mad_u32_u24 v3, v4, v7, v3 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v6, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 @@ -1549,31 +1542,31 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: flat_load_ushort v4, v[0:1] +; GFX8-NEXT: flat_load_ushort v3, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 -; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX8-NEXT: v_bfe_i32 v6, v3, 0, 8 -; GFX8-NEXT: v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX8-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX8-NEXT: v_bfe_i32 v6, v4, 0, 8 ; GFX8-NEXT: 
v_bfe_i32 v7, v2, 0, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v4, v8, v9, v4 +; GFX8-NEXT: v_mad_u16 v3, v8, v9, v3 +; GFX8-NEXT: v_and_b32_sdwa v10, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mad_u16 v4, v6, v7, v4 +; GFX8-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_mad_u16 v4, v10, v5, v4 -; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u16 v3, v10, v5, v3 +; GFX8-NEXT: v_mad_u16 v2, v4, v2, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1674,7 +1667,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-TRUE16-NEXT: s_clause 0x1 +; GFX11-DL-TRUE16-NEXT: s_clause 0x2 ; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[0:1] ; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[2:3] ; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v5, s[4:5] @@ -1793,26 +1786,25 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 +; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8 +; GFX7-NEXT: s_waitcnt 
vmcnt(0) ; GFX7-NEXT: v_bfe_i32 v7, v0, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 +; GFX7-NEXT: v_mad_u32_u24 v3, v4, v7, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v6, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 @@ -1970,7 +1962,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-TRUE16-NEXT: s_clause 0x1 +; GFX11-DL-TRUE16-NEXT: s_clause 0x2 ; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[2:3] ; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[0:1] ; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v4, s[4:5] @@ -2129,27 +2121,27 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3 -; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 8 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, 8, v3 -; 
GFX8-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 8 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, 8, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 8 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, s0 ; GFX8-NEXT: v_mad_u32_u24 v0, v5, v7, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v4, v6, v0 -; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0 +; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2267,29 +2259,28 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v2 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0 -; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 -; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX7-NEXT: v_bfe_u32 v2, v0, 8, 8 ; GFX7-NEXT: 
v_alignbit_b32 v0, v6, v0, 16 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v7, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX7-NEXT: v_mad_u32_u24 v3, v5, v7, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v3, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v2, v4, v2, v3 +; GFX7-NEXT: v_mad_u32_u24 v0, v1, v0, v2 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v5, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -2449,7 +2440,7 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-TRUE16-NEXT: s_clause 0x1 +; GFX11-DL-TRUE16-NEXT: s_clause 0x2 ; GFX11-DL-TRUE16-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[2:3] ; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v3, s[4:5] @@ -2554,36 +2545,35 @@ entry: define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, s7 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b64 
s[8:9], s[2:3] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 +; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v8, v0, 8, 8 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX7-NEXT: v_mad_u32_u24 v3, v4, v7, v3 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_u32_u24 v3, v5, v8, v3 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3 +; GFX7-NEXT: v_mad_u32_u24 v0, v1, v6, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_acc8_vecMul: @@ -2595,27 +2585,27 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; 
GFX8-NEXT: flat_load_dword v2, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: flat_load_ubyte v4, v[0:1] +; GFX8-NEXT: flat_load_ubyte v3, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 +; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX8-NEXT: v_mul_lo_u16_e32 v9, v5, v6 ; GFX8-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v7 ; GFX8-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v8 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u16 v2, v4, v2, v3 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v8 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v7 ; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2 @@ -2724,7 +2714,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-TRUE16-NEXT: s_clause 0x1 +; GFX11-DL-TRUE16-NEXT: s_clause 0x2 ; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[0:1] ; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[2:3] ; GFX11-DL-TRUE16-NEXT: global_load_d16_u8 v0, v5, s[4:5] @@ -2768,7 +2758,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-FAKE16-NEXT: v_lshlrev_b32_e32 
v0, 2, v0 ; GFX11-DL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-FAKE16-NEXT: s_clause 0x1 +; GFX11-DL-FAKE16-NEXT: s_clause 0x2 ; GFX11-DL-FAKE16-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-DL-FAKE16-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-DL-FAKE16-NEXT: global_load_u8 v3, v2, s[4:5] @@ -2868,21 +2858,21 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GFX8-NEXT: v_bfe_u32 v3, v3, 8, 8 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8 +; GFX8-NEXT: v_bfe_u32 v1, v1, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0 -; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -3039,24 +3029,24 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 -; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1 +; GFX8-NEXT: v_bfe_u32 v4, v0, 8, 8 +; GFX8-NEXT: v_bfe_u32 v5, v1, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, s0 ; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1 -; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -3224,24 +3214,24 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3 -; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v3 -; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX8-NEXT: 
v_and_b32_e32 v5, 0xff, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, s0 ; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1 -; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 +; GFX8-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -3411,25 +3401,25 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v3 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 8 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v2 -; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 8 ; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 8 -; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: 
v_mad_u32_u24 v3, v3, v5, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_mad_u32_u24 v4, v7, v8, v4 -; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u32_u24 v3, v7, v8, v3 +; GFX8-NEXT: v_mad_u32_u24 v2, v4, v2, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -3581,36 +3571,36 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; GFX8-LABEL: udot4_acc32_3src: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v5 +; GFX8-NEXT: v_bfe_u32 v3, v5, 8, 8 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_bfe_u32 v2, v2, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v1, v1, v1, s0 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 8 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_u32 v4, v4, 8, 8 -; 
GFX8-NEXT: v_mad_u32_u24 v1, v2, v4, v1 +; GFX8-NEXT: v_bfe_u32 v4, v5, 16, 8 +; GFX8-NEXT: v_mad_u32_u24 v1, v3, v2, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX8-NEXT: v_mad_u32_u24 v1, v5, v6, v1 -; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v1, v4, v6, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v5, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -3787,33 +3777,33 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX8-LABEL: udot4_acc32_3src_3ele: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v5 +; GFX8-NEXT: v_bfe_u32 v3, v5, 8, 8 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_bfe_u32 v2, v2, 8, 8 ; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v1, v1, v1, s0 -; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_u32 v4, v4, 8, 8 -; GFX8-NEXT: v_mad_u32_u24 v1, v2, v4, v1 +; GFX8-NEXT: v_bfe_u32 v4, v5, 16, 8 +; GFX8-NEXT: v_mad_u32_u24 v1, v3, v2, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v4, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -4207,24 +4197,24 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 -; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1 +; GFX8-NEXT: v_bfe_u32 v4, v0, 8, 8 +; GFX8-NEXT: v_bfe_u32 v5, v1, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, s0 +; GFX8-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1 -; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 
v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -4393,32 +4383,32 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX8-LABEL: udot4_acc32_3src_3ele_src0: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8 +; GFX8-NEXT: v_bfe_u32 v3, v5, 8, 8 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_u32 v1, v4, 8, 8 -; GFX8-NEXT: v_bfe_u32 v3, v4, 16, 8 +; GFX8-NEXT: v_bfe_u32 v1, v2, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v4, v1, v1, s0 -; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v4 +; GFX8-NEXT: v_bfe_u32 v2, v2, 16, 8 +; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -4599,42 +4589,42 @@ 
define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; GFX8-LABEL: udot4_4src: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s10, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s13 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s12, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v6 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s10, v6 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, s13 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s12, v6 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] +; GFX8-NEXT: flat_load_dword v3, v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v1, s15 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v6 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v7 +; GFX8-NEXT: v_bfe_u32 v4, v7, 8, 8 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX8-NEXT: v_bfe_u32 v4, v4, 8, 8 -; GFX8-NEXT: v_mad_u32_u24 v1, v3, v4, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v2 +; GFX8-NEXT: 
v_bfe_u32 v2, v2, 8, 8 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mad_u32_u24 v1, v1, v4, s2 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v5 -; GFX8-NEXT: v_bfe_u32 v5, v5, 8, 8 -; GFX8-NEXT: v_mad_u32_u24 v1, v6, v5, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v3 +; GFX8-NEXT: v_bfe_u32 v3, v3, 8, 8 +; GFX8-NEXT: v_mad_u32_u24 v1, v5, v2, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v7, 0xff, v0 ; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8 +; GFX8-NEXT: v_mad_u32_u24 v1, v6, v3, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v7, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -5109,24 +5099,24 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v2, v[2:3] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v2 -; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 8 -; GFX8-NEXT: v_mad_u32_u24 v3, v6, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v3 +; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 8 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 8 +; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v6 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, 
v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 -; GFX8-NEXT: v_mad_u32_u24 v3, v8, v5, v3 -; GFX8-NEXT: v_mad_u32_u24 v2, v2, v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_mad_u32_u24 v4, v7, v8, v4 +; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -5471,24 +5461,24 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v2 -; GFX8-NEXT: v_bfe_u32 v7, v2, 8, 8 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 8 -; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v3 -; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_bfe_u32 v3, v3, 8, 8 -; GFX8-NEXT: v_mad_u32_u24 v4, v7, v8, v4 -; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4 +; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX8-NEXT: v_bfe_u32 v7, v4, 8, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX8-NEXT: v_mad_u32_u24 v3, v3, v5, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 +; GFX8-NEXT: v_bfe_u32 v2, v2, 8, 8 +; GFX8-NEXT: v_mad_u32_u24 v3, v7, v8, v3 +; GFX8-NEXT: v_mad_u32_u24 v2, v4, v2, v3 ; 
GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -5895,6 +5885,7 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2 killed $sgpr3 ; GFX10-DL-NEXT: ; kill: killed $vgpr5 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] ; GFX10-DL-NEXT: global_load_dword v0, v5, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -5915,6 +5906,7 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b128 v[0:3], v1, s[0:1] ; GFX11-DL-NEXT: global_load_b32 v0, v4, s[2:3] ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) @@ -6011,29 +6003,29 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s0, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_movk_i32 s0, 0xfc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v3 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dword v2, v[2:3] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX8-NEXT: v_mul_u32_u24_sdwa 
v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1 -; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v2 -; GFX8-NEXT: v_mad_u32_u24 v3, v6, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v3 +; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xff, v2 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 8 +; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v6 ; GFX8-NEXT: v_bfe_u32 v2, v2, 8, 8 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 -; GFX8-NEXT: v_mad_u32_u24 v3, v8, v5, v3 -; GFX8-NEXT: v_mad_u32_u24 v2, v2, v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_mad_u32_u24 v4, v7, v8, v4 +; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -6087,6 +6079,7 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v1, s[2:3] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1] offset:252 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -6107,6 +6100,7 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 3, v0 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v1, s[2:3] ; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] offset:252 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index b9d3763e7def1..e94959b39ad35 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -72,43 +72,43 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 ; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4 -; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4 -; GFX8-NEXT: v_bfe_i32 v6, v3, 8, 4 -; GFX8-NEXT: v_bfe_i32 v8, v3, 12, 4 -; GFX8-NEXT: v_bfe_i32 v10, v3, 16, 4 -; GFX8-NEXT: v_bfe_i32 v12, v3, 20, 4 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4 -; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 4 +; GFX8-NEXT: v_bfe_i32 v4, v0, 4, 4 +; GFX8-NEXT: v_bfe_i32 v5, v1, 4, 4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0 -; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4 -; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 -; GFX8-NEXT: v_bfe_i32 v9, v0, 12, 4 -; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1 -; GFX8-NEXT: v_bfe_i32 v11, v0, 16, 4 -; GFX8-NEXT: v_mad_i32_i24 v1, v8, v9, v1 -; GFX8-NEXT: v_bfe_i32 v13, v0, 20, 4 -; GFX8-NEXT: v_mad_i32_i24 v1, v10, v11, v1 -; GFX8-NEXT: v_bfe_i32 v14, v3, 24, 4 -; GFX8-NEXT: v_bfe_i32 v15, v0, 24, 4 -; GFX8-NEXT: v_mad_i32_i24 v1, v12, v13, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 28, v3 +; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, s0 +; GFX8-NEXT: v_bfe_i32 v6, v0, 8, 4 +; GFX8-NEXT: v_bfe_i32 v7, v1, 8, 4 +; GFX8-NEXT: v_mad_i32_i24 v2, v4, v5, v2 +; GFX8-NEXT: v_bfe_i32 v8, v0, 12, 4 +; GFX8-NEXT: v_bfe_i32 v9, v1, 12, 4 +; GFX8-NEXT: v_mad_i32_i24 v2, v6, v7, v2 +; GFX8-NEXT: v_bfe_i32 v10, v0, 16, 4 +; 
GFX8-NEXT: v_bfe_i32 v11, v1, 16, 4 +; GFX8-NEXT: v_mad_i32_i24 v2, v8, v9, v2 +; GFX8-NEXT: v_bfe_i32 v12, v0, 20, 4 +; GFX8-NEXT: v_bfe_i32 v13, v1, 20, 4 +; GFX8-NEXT: v_mad_i32_i24 v2, v10, v11, v2 +; GFX8-NEXT: v_bfe_i32 v14, v0, 24, 4 +; GFX8-NEXT: v_bfe_i32 v15, v1, 24, 4 +; GFX8-NEXT: v_mad_i32_i24 v2, v12, v13, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 28, v0 -; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1 -; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 28, v1 +; GFX8-NEXT: v_mad_i32_i24 v2, v14, v15, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -320,26 +320,25 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4 +; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4 ; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v10, v3 ; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 ; GFX7-NEXT: 
v_and_b32_e32 v5, 0xffff, v5 ; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 @@ -690,7 +689,7 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_clause 0x2 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9] ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[10:11] ; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] @@ -842,26 +841,25 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4 +; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4 ; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10 ; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 ; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v10, v3 ; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 @@ -1212,7 +1210,7 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; 
GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_clause 0x2 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9] ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[10:11] ; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v3, v2, s[0:1] @@ -1414,44 +1412,44 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 ; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4 -; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4 -; GFX8-NEXT: v_bfe_i32 v6, v3, 8, 4 -; GFX8-NEXT: v_bfe_i32 v8, v3, 12, 4 -; GFX8-NEXT: v_bfe_i32 v10, v3, 16, 4 -; GFX8-NEXT: v_bfe_i32 v12, v3, 20, 4 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v16, v1, v2, s0 -; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4 -; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, v16 -; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4 -; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 -; GFX8-NEXT: v_bfe_i32 v9, v0, 12, 4 -; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1 -; GFX8-NEXT: v_bfe_i32 v11, v0, 16, 4 -; GFX8-NEXT: v_mad_i32_i24 v1, v8, v9, v1 -; GFX8-NEXT: v_bfe_i32 
v13, v0, 20, 4 -; GFX8-NEXT: v_mad_i32_i24 v1, v10, v11, v1 -; GFX8-NEXT: v_bfe_i32 v14, v3, 24, 4 -; GFX8-NEXT: v_bfe_i32 v15, v0, 24, 4 -; GFX8-NEXT: v_mad_i32_i24 v1, v12, v13, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 28, v3 +; GFX8-NEXT: v_mad_i32_i24 v16, v2, v3, s0 +; GFX8-NEXT: v_bfe_i32 v4, v0, 4, 4 +; GFX8-NEXT: v_bfe_i32 v5, v1, 4, 4 +; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, v16 +; GFX8-NEXT: v_bfe_i32 v6, v0, 8, 4 +; GFX8-NEXT: v_bfe_i32 v7, v1, 8, 4 +; GFX8-NEXT: v_mad_i32_i24 v2, v4, v5, v2 +; GFX8-NEXT: v_bfe_i32 v8, v0, 12, 4 +; GFX8-NEXT: v_bfe_i32 v9, v1, 12, 4 +; GFX8-NEXT: v_mad_i32_i24 v2, v6, v7, v2 +; GFX8-NEXT: v_bfe_i32 v10, v0, 16, 4 +; GFX8-NEXT: v_bfe_i32 v11, v1, 16, 4 +; GFX8-NEXT: v_mad_i32_i24 v2, v8, v9, v2 +; GFX8-NEXT: v_bfe_i32 v12, v0, 20, 4 +; GFX8-NEXT: v_bfe_i32 v13, v1, 20, 4 +; GFX8-NEXT: v_mad_i32_i24 v2, v10, v11, v2 +; GFX8-NEXT: v_bfe_i32 v14, v0, 24, 4 +; GFX8-NEXT: v_bfe_i32 v15, v1, 24, 4 +; GFX8-NEXT: v_mad_i32_i24 v2, v12, v13, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 28, v0 -; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1 -; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 28, v1 +; GFX8-NEXT: v_mad_i32_i24 v2, v14, v15, v2 +; GFX8-NEXT: v_mad_i32_i24 v0, v0, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v16, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2015,18 +2013,18 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_i32 v6, v2, 0, 4 
-; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v13, v0, 0, 4 +; GFX7-NEXT: v_bfe_i32 v1, v2, 24, 4 ; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_i32 v13, v0, 0, 4 ; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 ; GFX7-NEXT: v_ashrrev_i32_e32 v7, 28, v2 ; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4 @@ -2043,11 +2041,10 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 +; GFX7-NEXT: v_mad_u32_u24 v3, v6, v13, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 @@ -2057,12 +2054,12 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -2427,7 +2424,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: 
s_clause 0x2 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9] ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[10:11] ; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] @@ -2560,18 +2557,18 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_i32 v7, v2, 0, 4 -; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v14, v0, 0, 4 +; GFX7-NEXT: v_bfe_i32 v1, v2, 24, 4 ; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_i32 v14, v0, 0, 4 ; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4 ; GFX7-NEXT: v_bfe_i32 v6, v2, 8, 4 ; GFX7-NEXT: v_ashrrev_i32_e32 v8, 28, v2 @@ -2588,11 +2585,10 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 +; GFX7-NEXT: v_mad_u32_u24 v3, v7, v14, v3 ; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3 ; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v9 ; GFX7-NEXT: v_and_b32_e32 v16, 0xff, v16 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 @@ -2602,12 +2598,12 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11 ; 
GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -2910,7 +2906,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-XNACK-NEXT: s_clause 0x1 +; GFX10-DL-XNACK-NEXT: s_clause 0x2 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[8:9] ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[10:11] ; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v4, s[0:1] @@ -3011,7 +3007,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 +; GFX10-DL-NOXNACK-NEXT: s_clause 0x2 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[8:9] ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[10:11] ; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v4, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index 50f0a39802270..97b5481a50caf 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -296,14 +296,14 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 ; 
GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 ; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 ; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 @@ -311,7 +311,7 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 ; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 ; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 ; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 @@ -320,15 +320,14 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 ; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3 ; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -491,7 +490,7 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_clause 0x2 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] ; GFX10-DL-NEXT: global_load_ushort v4, v1, s[6:7] @@ -614,14 +613,14 @@ 
define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 ; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 ; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 @@ -629,7 +628,7 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 ; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 ; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 ; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 @@ -638,15 +637,14 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 ; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3 ; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -809,7 +807,7 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 ; 
GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_clause 0x2 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] @@ -932,14 +930,14 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 ; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 ; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 @@ -947,7 +945,7 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 ; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 ; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 ; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 @@ -956,15 +954,14 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 ; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3 ; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX7-NEXT: 
v_mad_u32_u24 v0, v4, v11, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1131,7 +1128,7 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_clause 0x2 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] @@ -1239,14 +1236,14 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 ; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 ; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 @@ -1254,7 +1251,7 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 ; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 ; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 ; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 @@ -1263,15 +1260,14 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 ; GFX7-NEXT: 
v_bfe_u32 v16, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3 ; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1438,7 +1434,7 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_clause 0x2 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] @@ -2120,14 +2116,14 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v3, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 ; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 ; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 @@ -2135,7 +2131,7 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 ; 
GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 ; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 ; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 @@ -2144,15 +2140,14 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 ; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3 ; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2444,14 +2439,14 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 ; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 ; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 @@ -2459,7 +2454,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 ; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 
-; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 ; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 ; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 @@ -2468,15 +2463,14 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 ; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3 ; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2697,7 +2691,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_clause 0x2 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[6:7] @@ -2807,14 +2801,14 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; 
GFX7-NEXT: v_lshrrev_b32_e32 v1, 28, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 ; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 ; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 @@ -2822,7 +2816,7 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 ; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 ; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 ; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 @@ -2831,15 +2825,14 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 ; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3 ; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v1, v10, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -3171,33 +3164,33 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: 
v_and_b32_e32 v1, 15, v3 -; GFX8-NEXT: v_bfe_u32 v4, v3, 4, 4 -; GFX8-NEXT: v_bfe_u32 v6, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v10, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v12, v3, 20, 4 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX8-NEXT: v_bfe_u32 v5, v0, 4, 4 -; GFX8-NEXT: v_bfe_u32 v7, v0, 8, 4 -; GFX8-NEXT: v_bfe_u32 v9, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v11, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v13, v0, 20, 4 -; GFX8-NEXT: v_bfe_u32 v14, v3, 24, 4 -; GFX8-NEXT: v_bfe_u32 v15, v0, 24, 4 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 28, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v3, 15, v1 +; GFX8-NEXT: v_bfe_u32 v4, v0, 4, 4 +; GFX8-NEXT: v_bfe_u32 v5, v1, 4, 4 +; GFX8-NEXT: v_bfe_u32 v6, v0, 8, 4 +; GFX8-NEXT: v_bfe_u32 v7, v1, 8, 4 +; GFX8-NEXT: v_bfe_u32 v8, v0, 12, 4 +; GFX8-NEXT: v_bfe_u32 v9, v1, 12, 4 +; GFX8-NEXT: v_bfe_u32 v10, v0, 16, 4 +; GFX8-NEXT: v_bfe_u32 v11, v1, 16, 4 +; GFX8-NEXT: v_bfe_u32 v12, v0, 20, 4 +; GFX8-NEXT: v_bfe_u32 v13, v1, 20, 4 +; GFX8-NEXT: v_bfe_u32 v14, v0, 24, 4 +; GFX8-NEXT: v_bfe_u32 v15, v1, 24, 4 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 28, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, s0 -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, s0 +; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, v2 ; GFX8-NEXT: v_mad_u32_u24 v0, v5, v4, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v7, v6, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v9, v8, v0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll index f8770642cc006..241a0fd4bfdf3 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -55,6 +55,7 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX12-NEXT: s_sext_i32_i16 s13, s13 ; GFX12-NEXT: s_add_co_u32 s12, s12, 
wobble@gotpcrel32@lo+8 ; GFX12-NEXT: s_add_co_ci_u32 s13, s13, wobble@gotpcrel32@hi+16 +; GFX12-NEXT: s_clause 0x2 ; GFX12-NEXT: s_load_u8 s14, s[4:5], 0x0 ; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 ; GFX12-NEXT: s_load_b64 s[6:7], s[12:13], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index e0dacb7a59a42..18bd7db458b70 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1593,6 +1593,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s4, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll index 1c298014e33e7..3b17e84ee3ee1 100644 --- a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll @@ -29,17 +29,16 @@ define amdgpu_gfx [13 x i32] @issue130120() { ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_cmp_eq_u32 s46, 0 ; CHECK-NEXT: s_mov_b32 s49, s48 +; CHECK-NEXT: s_cselect_b32 s47, s45, 0xf0 ; CHECK-NEXT: s_mov_b32 s50, s48 ; CHECK-NEXT: s_cselect_b32 s51, 0, s1 -; CHECK-NEXT: s_cselect_b32 s55, 0, s35 +; CHECK-NEXT: s_cselect_b32 vcc_lo, 0, s43 ; CHECK-NEXT: v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49 ; CHECK-NEXT: s_cselect_b32 s52, 0, s2 -; CHECK-NEXT: s_cselect_b32 s56, 0, s36 -; CHECK-NEXT: s_cselect_b32 vcc_lo, 0, s43 -; CHECK-NEXT: v_mov_b32_e32 v4, s50 -; CHECK-NEXT: s_cselect_b32 s47, s45, 0xf0 ; CHECK-NEXT: s_cselect_b32 s53, 0, s3 ; CHECK-NEXT: s_cselect_b32 s54, 0, s34 +; CHECK-NEXT: s_cselect_b32 s55, 0, s35 +; 
CHECK-NEXT: s_cselect_b32 s56, 0, s36 ; CHECK-NEXT: s_cselect_b32 s57, 0, s37 ; CHECK-NEXT: s_cselect_b32 s58, 0, s38 ; CHECK-NEXT: s_cselect_b32 s59, 0, s0 @@ -49,6 +48,9 @@ define amdgpu_gfx [13 x i32] @issue130120() { ; CHECK-NEXT: s_cselect_b32 s63, 0, s42 ; CHECK-NEXT: s_cselect_b32 vcc_hi, 0, s44 ; CHECK-NEXT: s_mov_b32 s46, s48 +; CHECK-NEXT: v_mov_b32_e32 v4, s50 +; CHECK-NEXT: s_clause 0xf +; CHECK-NEXT: scratch_store_b32 off, v0, s47 ; CHECK-NEXT: scratch_store_b32 off, v0, s51 ; CHECK-NEXT: scratch_store_b32 off, v0, s52 ; CHECK-NEXT: scratch_store_b32 off, v0, s53 @@ -56,7 +58,6 @@ define amdgpu_gfx [13 x i32] @issue130120() { ; CHECK-NEXT: scratch_store_b32 off, v0, s55 ; CHECK-NEXT: scratch_store_b64 off, v[0:1], s56 ; CHECK-NEXT: scratch_store_b32 off, v0, s57 -; CHECK-NEXT: scratch_store_b32 off, v0, s47 ; CHECK-NEXT: scratch_store_b96 off, v[2:4], s58 ; CHECK-NEXT: scratch_store_b96 off, v[2:4], s59 ; CHECK-NEXT: scratch_store_b32 off, v0, s60 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index 9df995b5a7066..da474d3889413 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -4706,10 +4706,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { ; SI-LABEL: packed_struct_argument_alignment: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_load_dword s2, s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:49 ; SI-NEXT: buffer_load_ubyte v5, off, s[4:7], 0 offset:50 ; SI-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:51 @@ -4754,11 +4754,11 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; VI-NEXT: v_mov_b32_e32 v7, s1 ; 
VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: s_add_u32 s0, s4, 53 ; VI-NEXT: flat_load_ubyte v8, v[0:1] ; VI-NEXT: flat_load_ubyte v9, v[2:3] ; VI-NEXT: flat_load_ubyte v10, v[4:5] ; VI-NEXT: flat_load_ubyte v6, v[6:7] -; VI-NEXT: s_add_u32 s0, s4, 53 ; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index 04abb75c3f912..8b22b93cb4102 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -126,6 +126,7 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { ; CHECK-NEXT: s_add_u32 s18, s18, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s19, s19, use_module@gotpcrel32@hi+12 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_clause 0x1 ; CHECK-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; CHECK-NEXT: s_load_dword s17, s[8:9], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 @@ -192,6 +193,7 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-NEXT: s_add_u32 s18, s18, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s19, s19, use_module@gotpcrel32@hi+12 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_clause 0x1 ; CHECK-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; CHECK-NEXT: s_load_dword s17, s[8:9], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 @@ -258,6 +260,7 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-NEXT: s_add_u32 s18, s18, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s19, s19, use_module@gotpcrel32@hi+12 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_clause 0x1 ; CHECK-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; CHECK-NEXT: s_load_dword s17, s[8:9], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 @@ -324,6 +327,7 @@ define amdgpu_kernel void 
@module_1_kernel_overalign_extern_overalign(i32 %idx) ; CHECK-NEXT: s_add_u32 s18, s18, use_module@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s19, s19, use_module@gotpcrel32@hi+12 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_clause 0x1 ; CHECK-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; CHECK-NEXT: s_load_dword s17, s[8:9], 0x0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bvh8_intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bvh8_intersect_ray.ll index ff65d5d96cb2c..46209c58fe27b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bvh8_intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bvh8_intersect_ray.ll @@ -13,6 +13,7 @@ define amdgpu_ps <10 x float> @image_bvh8_intersect_ray(i64 %node_ptr, float %ra ; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-SDAG-NEXT: image_bvh8_intersect_ray v[0:9], [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3] ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: global_store_b96 v[10:11], v[16:18], off ; GFX12-SDAG-NEXT: global_store_b96 v[12:13], v[19:21], off ; GFX12-SDAG-NEXT: ; return to shader part epilog @@ -25,6 +26,7 @@ define amdgpu_ps <10 x float> @image_bvh8_intersect_ray(i64 %node_ptr, float %ra ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-GISEL-NEXT: image_bvh8_intersect_ray v[0:9], [v[0:1], v[2:3], v[14:16], v[17:19], v9], s[0:3] ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: global_store_b96 v[10:11], v[14:16], off ; GFX12-GISEL-NEXT: global_store_b96 v[12:13], v[17:19], off ; GFX12-GISEL-NEXT: ; return to shader part epilog @@ -54,6 +56,7 @@ define amdgpu_ps <10 x float> @image_bvh8_intersect_ray_1(i64 %node_ptr, float % ; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 1 ; GFX12-SDAG-NEXT: image_bvh8_intersect_ray v[0:9], [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3] ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: 
global_store_b96 v[10:11], v[16:18], off ; GFX12-SDAG-NEXT: global_store_b96 v[12:13], v[19:21], off ; GFX12-SDAG-NEXT: ; return to shader part epilog @@ -66,6 +69,7 @@ define amdgpu_ps <10 x float> @image_bvh8_intersect_ray_1(i64 %node_ptr, float % ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 1 ; GFX12-GISEL-NEXT: image_bvh8_intersect_ray v[0:9], [v[0:1], v[2:3], v[14:16], v[17:19], v9], s[0:3] ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: global_store_b96 v[10:11], v[14:16], off ; GFX12-GISEL-NEXT: global_store_b96 v[12:13], v[17:19], off ; GFX12-GISEL-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll index 4e61cb4831545..010e0faf906e7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll @@ -221,50 +221,6 @@ define %trivial_types @dead_struct(i1 %cond, %trivial_types %x, ptr addrspace(1) ; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v12, v13 :: v_dual_mov_b32 v13, v14 ; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v14, v15 :: v_dual_mov_b32 v15, v16 ; ASM-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] -; ASM-GISEL-LABEL: dead_struct: -; ASM-GISEL: ; %bb.0: ; %entry -; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; ASM-GISEL-NEXT: s_wait_expcnt 0x0 -; ASM-GISEL-NEXT: s_wait_samplecnt 0x0 -; ASM-GISEL-NEXT: s_wait_bvhcnt 0x0 -; ASM-GISEL-NEXT: s_wait_kmcnt 0x0 -; ASM-GISEL-NEXT: v_mov_b32_e32 v20, v0 -; ASM-GISEL-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v2 -; ASM-GISEL-NEXT: s_mov_b32 s0, exec_lo -; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; ASM-GISEL-NEXT: v_and_b32_e32 v2, 1, v20 -; ASM-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v2 -; ASM-GISEL-NEXT: s_cbranch_execz .LBB1_2 -; ASM-GISEL-NEXT: ; %bb.1: ; %if.then -; ASM-GISEL-NEXT: s_mov_b32 s4, 0 -; ASM-GISEL-NEXT: s_mov_b32 s1, 0x3fc00000 -; ASM-GISEL-NEXT: s_wait_alu 0xfffe -; ASM-GISEL-NEXT: s_mov_b32 
s7, s4 -; ASM-GISEL-NEXT: s_mov_b32 s5, s4 -; ASM-GISEL-NEXT: s_mov_b32 s6, s4 -; ASM-GISEL-NEXT: s_wait_alu 0xfffe -; ASM-GISEL-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v13, s6 -; ASM-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_add_nc_u32 v0, 15, v19 -; ASM-GISEL-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v11, s4 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr8 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr9_vgpr10 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr15 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr16 -; ASM-GISEL-NEXT: global_store_b32 v[17:18], v0, off -; ASM-GISEL-NEXT: ; implicit-def: $vgpr0 -; ASM-GISEL-NEXT: .LBB1_2: ; %if.end -; ASM-GISEL-NEXT: s_wait_alu 0xfffe -; ASM-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; ASM-GISEL-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4 -; ASM-GISEL-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 -; ASM-GISEL-NEXT: v_dual_mov_b32 v6, v7 :: v_dual_mov_b32 v7, v8 -; ASM-GISEL-NEXT: v_dual_mov_b32 v8, v9 :: v_dual_mov_b32 v9, v10 -; ASM-GISEL-NEXT: v_dual_mov_b32 v10, v11 :: v_dual_mov_b32 v11, v12 -; ASM-GISEL-NEXT: v_dual_mov_b32 v12, v13 :: v_dual_mov_b32 v13, v14 -; ASM-GISEL-NEXT: v_dual_mov_b32 v14, v15 :: v_dual_mov_b32 v15, v16 -; ASM-GISEL-NEXT: s_setpc_b64 s[30:31] entry: br i1 %cond, label %if.then, label %if.end @@ -294,10 +250,10 @@ define [32 x i32] @dead_array(i1 %cond, [32 x i32] %x, ptr addrspace(1) %ptr1, i ; ASM-DAG-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v0 ; ASM-DAG-NEXT: v_mov_b32_e32 v0, v1 ; ASM-DAG-NEXT: s_clause 0x4 -; ASM-DAG-NEXT: scratch_load_b32 v35, off, s32 offset:12 ; ASM-DAG-NEXT: scratch_load_b32 v34, off, s32 offset:8 ; ASM-DAG-NEXT: scratch_load_b32 v31, off, s32 offset:4 ; ASM-DAG-NEXT: scratch_load_b32 v30, off, s32 +; ASM-DAG-NEXT: scratch_load_b32 v35, off, s32 offset:12 ; ASM-DAG-NEXT: scratch_load_b32 v1, off, s32 offset:16 ; ASM-DAG-NEXT: s_mov_b32 s0, exec_lo ; 
ASM-DAG-NEXT: v_and_b32_e32 v33, 1, v33 @@ -515,83 +471,6 @@ define [32 x i32] @dead_array(i1 %cond, [32 x i32] %x, ptr addrspace(1) %ptr1, i ; ASM-GISEL-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; ASM-GISEL-FAKE16-NEXT: s_wait_loadcnt 0x0 ; ASM-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] -; ASM-GISEL-LABEL: dead_array: -; ASM-GISEL: ; %bb.0: ; %entry -; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; ASM-GISEL-NEXT: s_wait_expcnt 0x0 -; ASM-GISEL-NEXT: s_wait_samplecnt 0x0 -; ASM-GISEL-NEXT: s_wait_bvhcnt 0x0 -; ASM-GISEL-NEXT: s_wait_kmcnt 0x0 -; ASM-GISEL-NEXT: v_mov_b32_e32 v32, v0 -; ASM-GISEL-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v2 -; ASM-GISEL-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4 -; ASM-GISEL-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 -; ASM-GISEL-NEXT: v_dual_mov_b32 v6, v7 :: v_dual_mov_b32 v7, v8 -; ASM-GISEL-NEXT: v_dual_mov_b32 v8, v9 :: v_dual_mov_b32 v9, v10 -; ASM-GISEL-NEXT: v_dual_mov_b32 v10, v11 :: v_dual_mov_b32 v11, v12 -; ASM-GISEL-NEXT: v_dual_mov_b32 v12, v13 :: v_dual_mov_b32 v13, v14 -; ASM-GISEL-NEXT: v_dual_mov_b32 v14, v15 :: v_dual_mov_b32 v15, v16 -; ASM-GISEL-NEXT: v_dual_mov_b32 v16, v17 :: v_dual_mov_b32 v17, v18 -; ASM-GISEL-NEXT: v_dual_mov_b32 v18, v19 :: v_dual_mov_b32 v19, v20 -; ASM-GISEL-NEXT: v_dual_mov_b32 v20, v21 :: v_dual_mov_b32 v21, v22 -; ASM-GISEL-NEXT: v_dual_mov_b32 v22, v23 :: v_dual_mov_b32 v23, v24 -; ASM-GISEL-NEXT: v_dual_mov_b32 v24, v25 :: v_dual_mov_b32 v25, v26 -; ASM-GISEL-NEXT: v_dual_mov_b32 v26, v27 :: v_dual_mov_b32 v27, v28 -; ASM-GISEL-NEXT: v_dual_mov_b32 v28, v29 :: v_dual_mov_b32 v29, v30 -; ASM-GISEL-NEXT: s_clause 0x4 -; ASM-GISEL-NEXT: scratch_load_b32 v30, off, s32 -; ASM-GISEL-NEXT: scratch_load_b32 v31, off, s32 offset:4 -; ASM-GISEL-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; ASM-GISEL-NEXT: scratch_load_b32 v34, off, s32 offset:12 -; ASM-GISEL-NEXT: scratch_load_b32 v35, off, s32 offset:16 -; ASM-GISEL-NEXT: v_and_b32_e32 v32, 1, v32 -; 
ASM-GISEL-NEXT: s_mov_b32 s0, exec_lo -; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ASM-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v32 -; ASM-GISEL-NEXT: s_cbranch_execz .LBB2_2 -; ASM-GISEL-NEXT: ; %bb.1: ; %if.then -; ASM-GISEL-NEXT: s_mov_b32 s1, 15 -; ASM-GISEL-NEXT: s_mov_b32 s2, 13 -; ASM-GISEL-NEXT: s_wait_loadcnt 0x0 -; ASM-GISEL-NEXT: s_wait_alu 0xfffe -; ASM-GISEL-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_add_nc_u32 v0, 15, v35 -; ASM-GISEL-NEXT: v_mov_b32_e32 v6, s2 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr1 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr2 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr3 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr4 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr5 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr8 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr9 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr10 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr11 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr12 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr13 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr14 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr15 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr16 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr17 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr18 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr19 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr20 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr21 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr22 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr23 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr24 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr25 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr26 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr27 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr28 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr29 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr30 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr31 -; ASM-GISEL-NEXT: global_store_b32 v[33:34], v0, off -; ASM-GISEL-NEXT: ; implicit-def: $vgpr0 -; ASM-GISEL-NEXT: .LBB2_2: ; %if.end -; ASM-GISEL-NEXT: s_wait_alu 0xfffe -; ASM-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; ASM-GISEL-NEXT: s_wait_loadcnt 0x0 -; ASM-GISEL-NEXT: 
s_setpc_b64 s[30:31] entry: br i1 %cond, label %if.then, label %if.end @@ -912,155 +791,6 @@ define %non_trivial_types @dead_non_trivial(i1 %cond, %non_trivial_types %x, ptr ; ASM-GISEL-FAKE16-NEXT: scratch_store_b32 v0, v67, off offset:204 ; ASM-GISEL-FAKE16-NEXT: s_wait_loadcnt 0x0 ; ASM-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] -; ASM-GISEL-LABEL: dead_non_trivial: -; ASM-GISEL: ; %bb.0: ; %entry -; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; ASM-GISEL-NEXT: s_wait_expcnt 0x0 -; ASM-GISEL-NEXT: s_wait_samplecnt 0x0 -; ASM-GISEL-NEXT: s_wait_bvhcnt 0x0 -; ASM-GISEL-NEXT: s_wait_kmcnt 0x0 -; ASM-GISEL-NEXT: s_clause 0x15 -; ASM-GISEL-NEXT: scratch_load_b32 v33, off, s32 -; ASM-GISEL-NEXT: scratch_load_b32 v34, off, s32 offset:4 -; ASM-GISEL-NEXT: scratch_load_b32 v35, off, s32 offset:8 -; ASM-GISEL-NEXT: scratch_load_b32 v36, off, s32 offset:12 -; ASM-GISEL-NEXT: scratch_load_b32 v37, off, s32 offset:16 -; ASM-GISEL-NEXT: scratch_load_b32 v38, off, s32 offset:20 -; ASM-GISEL-NEXT: scratch_load_b32 v39, off, s32 offset:24 -; ASM-GISEL-NEXT: scratch_load_b32 v48, off, s32 offset:28 -; ASM-GISEL-NEXT: scratch_load_b32 v49, off, s32 offset:32 -; ASM-GISEL-NEXT: scratch_load_b32 v50, off, s32 offset:36 -; ASM-GISEL-NEXT: scratch_load_b32 v51, off, s32 offset:40 -; ASM-GISEL-NEXT: scratch_load_b32 v52, off, s32 offset:44 -; ASM-GISEL-NEXT: scratch_load_b32 v53, off, s32 offset:48 -; ASM-GISEL-NEXT: scratch_load_b32 v54, off, s32 offset:52 -; ASM-GISEL-NEXT: scratch_load_b32 v55, off, s32 offset:56 -; ASM-GISEL-NEXT: scratch_load_b32 v64, off, s32 offset:60 -; ASM-GISEL-NEXT: scratch_load_b32 v65, off, s32 offset:64 -; ASM-GISEL-NEXT: scratch_load_b32 v66, off, s32 offset:68 -; ASM-GISEL-NEXT: scratch_load_b32 v67, off, s32 offset:72 -; ASM-GISEL-NEXT: scratch_load_b32 v31, off, s32 offset:76 -; ASM-GISEL-NEXT: scratch_load_b32 v32, off, s32 offset:80 -; ASM-GISEL-NEXT: scratch_load_b32 v68, off, s32 offset:84 -; ASM-GISEL-NEXT: v_and_b32_e32 v1, 1, v1 -; ASM-GISEL-NEXT: 
s_mov_b32 s0, exec_lo -; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; ASM-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v1 -; ASM-GISEL-NEXT: s_cbranch_execz .LBB3_2 -; ASM-GISEL-NEXT: ; %bb.1: ; %if.then -; ASM-GISEL-NEXT: s_mov_b32 s1, 0 -; ASM-GISEL-NEXT: s_movk_i32 s2, 0x3e00 -; ASM-GISEL-NEXT: s_wait_loadcnt 0x0 -; ASM-GISEL-NEXT: s_wait_alu 0xfffe -; ASM-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_add_nc_u32 v1, 15, v68 -; ASM-GISEL-NEXT: v_mov_b32_e32 v8, s1 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr2 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr3 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr5 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr6 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr7 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12_vgpr13 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr18 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr19 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr20 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr21 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr22 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr23 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr24 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr25 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr26 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr27 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr28 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr29 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr30 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr33 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr34 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr35 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr36 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr37 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr38 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr39 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr48 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr49 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr50 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr51 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr52 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr53 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr54 -; ASM-GISEL-NEXT: ; 
implicit-def: $vgpr55 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr64 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr65 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr66 -; ASM-GISEL-NEXT: ; implicit-def: $vgpr67 -; ASM-GISEL-NEXT: global_store_b32 v[31:32], v1, off -; ASM-GISEL-NEXT: .LBB3_2: ; %if.end -; ASM-GISEL-NEXT: s_wait_alu 0xfffe -; ASM-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; ASM-GISEL-NEXT: s_clause 0x16 -; ASM-GISEL-NEXT: scratch_store_b8 v0, v2, off -; ASM-GISEL-NEXT: scratch_store_b16 v0, v3, off offset:2 -; ASM-GISEL-NEXT: scratch_store_b16 v0, v4, off offset:4 -; ASM-GISEL-NEXT: scratch_store_b16 v0, v5, off offset:6 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v6, off offset:8 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v7, off offset:12 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v8, off offset:16 -; ASM-GISEL-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v13, off offset:48 -; ASM-GISEL-NEXT: scratch_store_b128 v0, v[14:17], off offset:64 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v18, off offset:80 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v19, off offset:84 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v20, off offset:88 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v21, off offset:92 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v22, off offset:96 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v23, off offset:100 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v24, off offset:104 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v25, off offset:108 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v26, off offset:112 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v27, off offset:116 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v28, off offset:120 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v29, off offset:124 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v30, off offset:128 -; ASM-GISEL-NEXT: s_wait_loadcnt 0x15 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v33, off offset:132 -; ASM-GISEL-NEXT: s_wait_loadcnt 0x14 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v34, off offset:136 -; 
ASM-GISEL-NEXT: s_wait_loadcnt 0x13 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v35, off offset:140 -; ASM-GISEL-NEXT: s_wait_loadcnt 0x12 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v36, off offset:144 -; ASM-GISEL-NEXT: s_wait_loadcnt 0x11 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v37, off offset:148 -; ASM-GISEL-NEXT: s_wait_loadcnt 0x10 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v38, off offset:152 -; ASM-GISEL-NEXT: s_wait_loadcnt 0xf -; ASM-GISEL-NEXT: scratch_store_b32 v0, v39, off offset:156 -; ASM-GISEL-NEXT: s_wait_loadcnt 0xe -; ASM-GISEL-NEXT: scratch_store_b32 v0, v48, off offset:160 -; ASM-GISEL-NEXT: s_wait_loadcnt 0xd -; ASM-GISEL-NEXT: scratch_store_b32 v0, v49, off offset:164 -; ASM-GISEL-NEXT: s_wait_loadcnt 0xc -; ASM-GISEL-NEXT: scratch_store_b32 v0, v50, off offset:168 -; ASM-GISEL-NEXT: s_wait_loadcnt 0xb -; ASM-GISEL-NEXT: scratch_store_b32 v0, v51, off offset:172 -; ASM-GISEL-NEXT: s_wait_loadcnt 0xa -; ASM-GISEL-NEXT: scratch_store_b32 v0, v52, off offset:176 -; ASM-GISEL-NEXT: s_wait_loadcnt 0x9 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v53, off offset:180 -; ASM-GISEL-NEXT: s_wait_loadcnt 0x8 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v54, off offset:184 -; ASM-GISEL-NEXT: s_wait_loadcnt 0x7 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v55, off offset:188 -; ASM-GISEL-NEXT: s_wait_loadcnt 0x6 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v64, off offset:192 -; ASM-GISEL-NEXT: s_wait_loadcnt 0x5 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v65, off offset:196 -; ASM-GISEL-NEXT: s_wait_loadcnt 0x4 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v66, off offset:200 -; ASM-GISEL-NEXT: s_wait_loadcnt 0x3 -; ASM-GISEL-NEXT: scratch_store_b32 v0, v67, off offset:204 -; ASM-GISEL-NEXT: s_wait_loadcnt 0x0 -; ASM-GISEL-NEXT: s_setpc_b64 s[30:31] entry: br i1 %cond, label %if.then, label %if.end diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll index 7e22d60cd710f..e8ca34fe20677 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll @@ -16,6 +16,7 @@ define amdgpu_ps <10 x float> @image_bvh_dual_intersect_ray(i64 %node_ptr, float ; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-SDAG-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[17:19], v[20:22], v[9:10]], s[0:3] ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: global_store_b96 v[11:12], v[17:19], off ; GFX12-SDAG-NEXT: global_store_b96 v[13:14], v[20:22], off ; GFX12-SDAG-NEXT: ; return to shader part epilog @@ -28,6 +29,7 @@ define amdgpu_ps <10 x float> @image_bvh_dual_intersect_ray(i64 %node_ptr, float ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-GISEL-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[15:17], v[18:20], v[9:10]], s[0:3] ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: global_store_b96 v[11:12], v[15:17], off ; GFX12-GISEL-NEXT: global_store_b96 v[13:14], v[18:20], off ; GFX12-GISEL-NEXT: ; return to shader part epilog @@ -57,6 +59,7 @@ define amdgpu_ps <10 x float> @image_bvh_dual_intersect_ray_1(i64 %node_ptr, flo ; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 1 ; GFX12-SDAG-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[17:19], v[20:22], v[9:10]], s[0:3] ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: global_store_b96 v[11:12], v[17:19], off ; GFX12-SDAG-NEXT: global_store_b96 v[13:14], v[20:22], off ; GFX12-SDAG-NEXT: ; return to shader part epilog @@ -69,6 +72,7 @@ define amdgpu_ps <10 x float> @image_bvh_dual_intersect_ray_1(i64 %node_ptr, flo ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 1 ; GFX12-GISEL-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[15:17], v[18:20], v[9:10]], s[0:3] ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: global_store_b96 v[11:12], v[15:17], off ; GFX12-GISEL-NEXT: 
global_store_b96 v[13:14], v[18:20], off ; GFX12-GISEL-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll index addb395eccf11..ca5a70f18a581 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll @@ -14,6 +14,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16( ; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] +; SDAG-GFX11-TRUE16-NEXT: s_clause 0x1 ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -27,6 +28,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16( ; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] +; SDAG-GFX11-FAKE16-NEXT: s_clause 0x1 ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -51,6 +53,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16_dpp( ; SDAG-GFX11-TRUE16: ; %bb.0: ; %entry ; SDAG-GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: s_clause 0x2 ; SDAG-GFX11-TRUE16-NEXT: scratch_load_b32 v1, off, s1 ; SDAG-GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s2 ; SDAG-GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s3 @@ -66,6 +69,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16_dpp( ; SDAG-GFX11-FAKE16: ; %bb.0: ; %entry ; SDAG-GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: s_clause 0x2 ; SDAG-GFX11-FAKE16-NEXT: scratch_load_b32 
v0, off, s2 ; SDAG-GFX11-FAKE16-NEXT: scratch_load_u16 v1, off, s3 ; SDAG-GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll index 19e03486d122d..9c7fbf49f72a2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll @@ -13,6 +13,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( ; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] +; SDAG-GFX11-TRUE16-NEXT: s_clause 0x1 ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -26,6 +27,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( ; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] +; SDAG-GFX11-FAKE16-NEXT: s_clause 0x1 ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -39,6 +41,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( ; GISEL-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] +; GISEL-GFX11-TRUE16-NEXT: s_clause 0x1 ; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0 ; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -52,6 +55,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( ; GISEL-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] +; GISEL-GFX11-FAKE16-NEXT: s_clause 0x1 ; 
GISEL-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0 ; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -76,6 +80,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp( ; SDAG-GFX11-TRUE16: ; %bb.0: ; %entry ; SDAG-GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: s_clause 0x2 ; SDAG-GFX11-TRUE16-NEXT: scratch_load_b32 v1, off, s1 ; SDAG-GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s2 ; SDAG-GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s3 @@ -91,6 +96,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp( ; SDAG-GFX11-FAKE16: ; %bb.0: ; %entry ; SDAG-GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: s_clause 0x2 ; SDAG-GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s2 ; SDAG-GFX11-FAKE16-NEXT: scratch_load_u16 v1, off, s3 ; SDAG-GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s1 @@ -103,6 +109,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp( ; GISEL-GFX11-TRUE16: ; %bb.0: ; %entry ; GISEL-GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-TRUE16-NEXT: s_clause 0x2 ; GISEL-GFX11-TRUE16-NEXT: scratch_load_b32 v1, off, s1 ; GISEL-GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s2 ; GISEL-GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s3 @@ -118,6 +125,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp( ; GISEL-GFX11-FAKE16: ; %bb.0: ; %entry ; GISEL-GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GISEL-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-FAKE16-NEXT: s_clause 0x2 ; GISEL-GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s1 ; GISEL-GFX11-FAKE16-NEXT: scratch_load_b32 v1, off, s2 ; GISEL-GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll index 159592cab6a34..284f3bf585d4e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -11,6 +11,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp( ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 @@ -71,6 +72,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp( ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll index 4a735a727229b..7d2cfffde7d14 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll @@ -49,10 +49,7 @@ define amdgpu_kernel void @mad_f32_imm_b( ; GCN-LABEL: {{^}}mad_f32_imm_c: ; GCN: v_mov_b32_e32 [[C:v[0-9]+]], 0x41000000 -; GCN: s_load_dword [[B:s[0-9]+]] -; GCN: s_load_dword [[A:s[0-9]+]] -; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] -; GCN: v_mac_f32_e32 [[C]], {{s[0-9]+}}, [[VB]]{{$}} +; GCN: v_mac_f32_e32 [[C]], {{s[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @mad_f32_imm_c( ptr addrspace(1) %r, ptr addrspace(1) %a, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 4fa4b73456ecd..44b25d06876be 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -422,6 +422,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr 
%p_node_ptr, ; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s9, 0, s0 ; GFX1013-NEXT: v_add_co_u32 v4, s0, s10, v0 ; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s11, 0, s0 +; GFX1013-NEXT: s_clause 0x1 ; GFX1013-NEXT: flat_load_dword v0, v[2:3] ; GFX1013-NEXT: flat_load_dword v1, v[4:5] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 @@ -450,6 +451,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX1030-NEXT: v_add_co_u32 v2, s0, s2, v2 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 +; GFX1030-NEXT: s_clause 0x1 ; GFX1030-NEXT: flat_load_dword v0, v[0:1] ; GFX1030-NEXT: flat_load_dword v1, v[2:3] ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 @@ -475,6 +477,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-NEXT: v_add_co_u32 v2, s0, s2, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] ; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000 @@ -503,6 +506,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX12-SDAG-NEXT: v_add_co_u32 v2, s0, s2, v2 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 +; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: flat_load_b32 v9, v[0:1] ; GFX12-SDAG-NEXT: flat_load_b32 v10, v[2:3] ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x40e00000 @@ -540,6 +544,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: flat_load_b32 v9, v[0:1] ; GFX12-GISEL-NEXT: flat_load_b32 v10, v[2:3] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe @@ -583,6 +588,7 @@ define 
amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s9, 0, s0 ; GFX1013-NEXT: v_add_co_u32 v4, s0, s10, v0 ; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s11, 0, s0 +; GFX1013-NEXT: s_clause 0x1 ; GFX1013-NEXT: flat_load_dword v0, v[2:3] ; GFX1013-NEXT: flat_load_dword v1, v[4:5] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 @@ -608,6 +614,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX1030-NEXT: v_add_co_u32 v2, s0, s2, v2 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 +; GFX1030-NEXT: s_clause 0x1 ; GFX1030-NEXT: flat_load_dword v0, v[0:1] ; GFX1030-NEXT: flat_load_dword v1, v[2:3] ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 @@ -631,6 +638,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-NEXT: v_add_co_u32 v2, s0, s2, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] ; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 @@ -656,6 +664,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX12-SDAG-NEXT: v_add_co_u32 v2, s0, s2, v2 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 +; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: flat_load_b32 v6, v[0:1] ; GFX12-SDAG-NEXT: flat_load_b32 v7, v[2:3] ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x47004400 @@ -688,6 +697,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: flat_load_b32 v6, v[0:1] ; GFX12-GISEL-NEXT: flat_load_b32 v7, v[2:3] ; 
GFX12-GISEL-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll index 0fe371c1b51fe..130616ed030ef 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll @@ -52,23 +52,24 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s13, s15 ; GCN-NEXT: s_mov_b32 s12, s14 -; GCN-NEXT: s_load_dwordx2 s[18:19], s[8:9], 0x0 -; GCN-NEXT: s_add_u32 s8, s8, 8 +; GCN-NEXT: s_add_u32 s18, s8, 8 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_addc_u32 s19, s9, 0 ; GCN-NEXT: s_getpc_b64 s[14:15] ; GCN-NEXT: s_add_u32 s14, s14, function_lds_id@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s15, s15, function_lds_id@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[20:21], s[14:15], 0x0 +; GCN-NEXT: s_load_dwordx2 s[20:21], s[8:9], 0x0 +; GCN-NEXT: s_load_dwordx2 s[22:23], s[14:15], 0x0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: v_or_b32_e32 v31, v0, v2 ; GCN-NEXT: s_mov_b32 s15, 21 +; GCN-NEXT: s_mov_b64 s[8:9], s[18:19] ; GCN-NEXT: s_mov_b32 s14, s16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NEXT: s_swappc_b64 s[30:31], s[20:21] +; GCN-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GCN-NEXT: s_endpgm call void @function_lds_id(ptr addrspace(1) %out) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index dac54c9f85e96..a861757c901b2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -17,8 +17,8 @@ define 
<16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 @@ -114,8 +114,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 @@ -211,8 +211,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 @@ -308,8 +308,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; 
SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 @@ -405,8 +405,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 @@ -502,8 +502,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 @@ -599,8 +599,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 @@ -696,8 +696,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; 
SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 @@ -845,8 +845,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 @@ -1294,8 +1294,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 @@ -1442,8 +1442,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 @@ -5332,8 +5332,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 @@ -5429,8 +5429,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 @@ -5526,8 +5526,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 @@ -5670,8 +5670,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 @@ -5767,8 +5767,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( 
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 @@ -5966,8 +5966,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8 +; SDAG-NEXT: scratch_load_dword a15, off, s32 ; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll index 6e24717a2827d..23781a6585c2c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll @@ -49,6 +49,7 @@ define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX11-NEXT: v_mov_b32_e32 v5, v4 ; GFX11-NEXT: buffer_load_u8 v[4:5], off, s[0:3], 0 tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b8 v[0:1], v4, off ; GFX11-NEXT: global_store_b32 v[2:3], v5, off ; GFX11-NEXT: s_endpgm @@ -60,6 +61,7 @@ define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX12-NEXT: v_mov_b32_e32 v5, v4 ; GFX12-NEXT: buffer_load_u8 v[4:5], off, s[0:3], null tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b8 v[0:1], v4, off ; GFX12-NEXT: global_store_b32 v[2:3], v5, off ; GFX12-NEXT: 
s_endpgm @@ -113,6 +115,7 @@ define amdgpu_ps void @raw_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr addrsp ; GFX11-NEXT: v_mov_b32_e32 v5, v4 ; GFX11-NEXT: buffer_load_u16 v[4:5], off, s[0:3], 0 tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b16 v[0:1], v4, off ; GFX11-NEXT: global_store_b32 v[2:3], v5, off ; GFX11-NEXT: s_endpgm @@ -124,6 +127,7 @@ define amdgpu_ps void @raw_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr addrsp ; GFX12-NEXT: v_mov_b32_e32 v5, v4 ; GFX12-NEXT: buffer_load_u16 v[4:5], off, s[0:3], null tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b16 v[0:1], v4, off ; GFX12-NEXT: global_store_b32 v[2:3], v5, off ; GFX12-NEXT: s_endpgm @@ -177,6 +181,7 @@ define amdgpu_ps void @raw_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr addrsp ; GFX11-NEXT: v_mov_b32_e32 v5, v4 ; GFX11-NEXT: buffer_load_u16 v[4:5], off, s[0:3], 0 tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b16 v[0:1], v4, off ; GFX11-NEXT: global_store_b32 v[2:3], v5, off ; GFX11-NEXT: s_endpgm @@ -188,6 +193,7 @@ define amdgpu_ps void @raw_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr addrsp ; GFX12-NEXT: v_mov_b32_e32 v5, v4 ; GFX12-NEXT: buffer_load_u16 v[4:5], off, s[0:3], null tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b16 v[0:1], v4, off ; GFX12-NEXT: global_store_b32 v[2:3], v5, off ; GFX12-NEXT: s_endpgm @@ -241,6 +247,7 @@ define amdgpu_ps void @raw_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrsp ; GFX11-NEXT: v_mov_b32_e32 v5, v4 ; GFX11-NEXT: buffer_load_b32 v[4:5], off, s[0:3], 0 tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off ; GFX11-NEXT: global_store_b32 v[2:3], v5, off ; GFX11-NEXT: s_endpgm @@ -252,6 +259,7 @@ define amdgpu_ps void @raw_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrsp ; GFX12-NEXT: v_mov_b32_e32 
v5, v4 ; GFX12-NEXT: buffer_load_b32 v[4:5], off, s[0:3], null tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b32 v[0:1], v4, off ; GFX12-NEXT: global_store_b32 v[2:3], v5, off ; GFX12-NEXT: s_endpgm @@ -325,6 +333,7 @@ define amdgpu_ps void @raw_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_load_b64 v[4:6], off, s[0:3], 0 tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off ; GFX11-NEXT: global_store_b32 v[2:3], v6, off ; GFX11-NEXT: s_endpgm @@ -336,6 +345,7 @@ define amdgpu_ps void @raw_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: buffer_load_b64 v[4:6], off, s[0:3], null tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off ; GFX12-NEXT: global_store_b32 v[2:3], v6, off ; GFX12-NEXT: s_endpgm @@ -409,6 +419,7 @@ define amdgpu_ps void @raw_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_load_b64 v[4:6], off, s[0:3], 0 tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off ; GFX11-NEXT: global_store_b32 v[2:3], v6, off ; GFX11-NEXT: s_endpgm @@ -420,6 +431,7 @@ define amdgpu_ps void @raw_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: buffer_load_b64 v[4:6], off, s[0:3], null tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off ; GFX12-NEXT: global_store_b32 v[2:3], v6, off ; GFX12-NEXT: s_endpgm @@ -498,6 +510,7 @@ define amdgpu_ps void @raw_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: v_mov_b32_e32 v7, v4 ; GFX11-NEXT: buffer_load_b96 v[4:7], off, s[0:3], 0 tfe ; 
GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b96 v[0:1], v[4:6], off ; GFX11-NEXT: global_store_b32 v[2:3], v7, off ; GFX11-NEXT: s_endpgm @@ -510,6 +523,7 @@ define amdgpu_ps void @raw_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX12-NEXT: v_mov_b32_e32 v7, v4 ; GFX12-NEXT: buffer_load_b96 v[4:7], off, s[0:3], null tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b96 v[0:1], v[4:6], off ; GFX12-NEXT: global_store_b32 v[2:3], v7, off ; GFX12-NEXT: s_endpgm @@ -588,6 +602,7 @@ define amdgpu_ps void @raw_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: v_mov_b32_e32 v7, v4 ; GFX11-NEXT: buffer_load_b96 v[4:7], off, s[0:3], 0 tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b96 v[0:1], v[4:6], off ; GFX11-NEXT: global_store_b32 v[2:3], v7, off ; GFX11-NEXT: s_endpgm @@ -600,6 +615,7 @@ define amdgpu_ps void @raw_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX12-NEXT: v_mov_b32_e32 v7, v4 ; GFX12-NEXT: buffer_load_b96 v[4:7], off, s[0:3], null tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b96 v[0:1], v[4:6], off ; GFX12-NEXT: global_store_b32 v[2:3], v7, off ; GFX12-NEXT: s_endpgm @@ -665,6 +681,7 @@ define amdgpu_ps void @raw_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: v_mov_b32_e32 v8, v4 ; GFX11-NEXT: buffer_load_b128 v[4:8], off, s[0:3], 0 tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX11-NEXT: global_store_b32 v[2:3], v8, off ; GFX11-NEXT: s_endpgm @@ -677,6 +694,7 @@ define amdgpu_ps void @raw_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 ; GFX12-NEXT: buffer_load_b128 v[4:8], off, s[0:3], null tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: 
global_store_b128 v[0:1], v[4:7], off ; GFX12-NEXT: global_store_b32 v[2:3], v8, off ; GFX12-NEXT: s_endpgm @@ -742,6 +760,7 @@ define amdgpu_ps void @raw_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: v_mov_b32_e32 v8, v4 ; GFX11-NEXT: buffer_load_b128 v[4:8], off, s[0:3], 0 tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX11-NEXT: global_store_b32 v[2:3], v8, off ; GFX11-NEXT: s_endpgm @@ -754,6 +773,7 @@ define amdgpu_ps void @raw_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 ; GFX12-NEXT: buffer_load_b128 v[4:8], off, s[0:3], null tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX12-NEXT: global_store_b32 v[2:3], v8, off ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll index 09abebd638611..d0a38ae1add7d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll @@ -51,11 +51,11 @@ define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) { ; CHECK-LABEL: test_softwqm1: ; CHECK: ; %bb.0: ; %main_body ; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v2, s1 -; CHECK-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen -; CHECK-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_add_f32_e32 v1, v1, v2 +; CHECK-NEXT: v_add_f32_e32 v1, v2, v1 ; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen ; CHECK-NEXT: v_add_f32_e32 v0, v1, v1 ; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec @@ -79,11 +79,11 @@ define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg 
%idx1) { ; CHECK-NEXT: s_mov_b64 s[2:3], exec ; CHECK-NEXT: s_wqm_b64 exec, exec ; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v2, s1 -; CHECK-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen -; CHECK-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_add_f32_e32 v1, v1, v2 +; CHECK-NEXT: v_add_f32_e32 v1, v2, v1 ; CHECK-NEXT: v_mov_b32_e32 v2, v1 ; CHECK-NEXT: v_add_f32_e32 v1, v1, v1 ; CHECK-NEXT: s_and_b64 exec, exec, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll index 60c04749c9b74..f441b0852ff61 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll @@ -49,6 +49,7 @@ define amdgpu_ps void @struct_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: v_mov_b32_e32 v5, v4 ; GFX11-NEXT: buffer_load_u8 v[4:5], v4, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b8 v[0:1], v4, off ; GFX11-NEXT: global_store_b32 v[2:3], v5, off ; GFX11-NEXT: s_endpgm @@ -60,6 +61,7 @@ define amdgpu_ps void @struct_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX12-NEXT: v_mov_b32_e32 v5, v4 ; GFX12-NEXT: buffer_load_u8 v[4:5], v4, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b8 v[0:1], v4, off ; GFX12-NEXT: global_store_b32 v[2:3], v5, off ; GFX12-NEXT: s_endpgm @@ -113,6 +115,7 @@ define amdgpu_ps void @struct_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr add ; GFX11-NEXT: v_mov_b32_e32 v5, v4 ; GFX11-NEXT: buffer_load_u16 v[4:5], v4, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: 
global_store_b16 v[0:1], v4, off ; GFX11-NEXT: global_store_b32 v[2:3], v5, off ; GFX11-NEXT: s_endpgm @@ -124,6 +127,7 @@ define amdgpu_ps void @struct_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr add ; GFX12-NEXT: v_mov_b32_e32 v5, v4 ; GFX12-NEXT: buffer_load_u16 v[4:5], v4, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b16 v[0:1], v4, off ; GFX12-NEXT: global_store_b32 v[2:3], v5, off ; GFX12-NEXT: s_endpgm @@ -177,6 +181,7 @@ define amdgpu_ps void @struct_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr add ; GFX11-NEXT: v_mov_b32_e32 v5, v4 ; GFX11-NEXT: buffer_load_u16 v[4:5], v4, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b16 v[0:1], v4, off ; GFX11-NEXT: global_store_b32 v[2:3], v5, off ; GFX11-NEXT: s_endpgm @@ -188,6 +193,7 @@ define amdgpu_ps void @struct_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr add ; GFX12-NEXT: v_mov_b32_e32 v5, v4 ; GFX12-NEXT: buffer_load_u16 v[4:5], v4, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b16 v[0:1], v4, off ; GFX12-NEXT: global_store_b32 v[2:3], v5, off ; GFX12-NEXT: s_endpgm @@ -241,6 +247,7 @@ define amdgpu_ps void @struct_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr add ; GFX11-NEXT: v_mov_b32_e32 v5, v4 ; GFX11-NEXT: buffer_load_b32 v[4:5], v4, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off ; GFX11-NEXT: global_store_b32 v[2:3], v5, off ; GFX11-NEXT: s_endpgm @@ -252,6 +259,7 @@ define amdgpu_ps void @struct_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr add ; GFX12-NEXT: v_mov_b32_e32 v5, v4 ; GFX12-NEXT: buffer_load_b32 v[4:5], v4, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b32 v[0:1], v4, off ; GFX12-NEXT: global_store_b32 v[2:3], v5, off ; GFX12-NEXT: s_endpgm @@ 
-325,6 +333,7 @@ define amdgpu_ps void @struct_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr a ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_load_b64 v[4:6], v4, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off ; GFX11-NEXT: global_store_b32 v[2:3], v6, off ; GFX11-NEXT: s_endpgm @@ -336,6 +345,7 @@ define amdgpu_ps void @struct_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr a ; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: buffer_load_b64 v[4:6], v4, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off ; GFX12-NEXT: global_store_b32 v[2:3], v6, off ; GFX12-NEXT: s_endpgm @@ -409,6 +419,7 @@ define amdgpu_ps void @struct_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr a ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_load_b64 v[4:6], v4, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off ; GFX11-NEXT: global_store_b32 v[2:3], v6, off ; GFX11-NEXT: s_endpgm @@ -420,6 +431,7 @@ define amdgpu_ps void @struct_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr a ; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: buffer_load_b64 v[4:6], v4, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off ; GFX12-NEXT: global_store_b32 v[2:3], v6, off ; GFX12-NEXT: s_endpgm @@ -498,6 +510,7 @@ define amdgpu_ps void @struct_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr a ; GFX11-NEXT: v_mov_b32_e32 v7, v4 ; GFX11-NEXT: buffer_load_b96 v[4:7], v4, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b96 v[0:1], v[4:6], off ; GFX11-NEXT: global_store_b32 v[2:3], v7, off ; GFX11-NEXT: s_endpgm @@ -510,6 +523,7 @@ define amdgpu_ps void 
@struct_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr a ; GFX12-NEXT: v_mov_b32_e32 v7, v4 ; GFX12-NEXT: buffer_load_b96 v[4:7], v4, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b96 v[0:1], v[4:6], off ; GFX12-NEXT: global_store_b32 v[2:3], v7, off ; GFX12-NEXT: s_endpgm @@ -588,6 +602,7 @@ define amdgpu_ps void @struct_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr a ; GFX11-NEXT: v_mov_b32_e32 v7, v4 ; GFX11-NEXT: buffer_load_b96 v[4:7], v4, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b96 v[0:1], v[4:6], off ; GFX11-NEXT: global_store_b32 v[2:3], v7, off ; GFX11-NEXT: s_endpgm @@ -600,6 +615,7 @@ define amdgpu_ps void @struct_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr a ; GFX12-NEXT: v_mov_b32_e32 v7, v4 ; GFX12-NEXT: buffer_load_b96 v[4:7], v4, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b96 v[0:1], v[4:6], off ; GFX12-NEXT: global_store_b32 v[2:3], v7, off ; GFX12-NEXT: s_endpgm @@ -665,6 +681,7 @@ define amdgpu_ps void @struct_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr a ; GFX11-NEXT: v_mov_b32_e32 v8, v4 ; GFX11-NEXT: buffer_load_b128 v[4:8], v4, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX11-NEXT: global_store_b32 v[2:3], v8, off ; GFX11-NEXT: s_endpgm @@ -677,6 +694,7 @@ define amdgpu_ps void @struct_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr a ; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 ; GFX12-NEXT: buffer_load_b128 v[4:8], v4, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX12-NEXT: global_store_b32 v[2:3], v8, off ; GFX12-NEXT: s_endpgm @@ -742,6 +760,7 @@ define amdgpu_ps void @struct_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr a ; 
GFX11-NEXT: v_mov_b32_e32 v8, v4 ; GFX11-NEXT: buffer_load_b128 v[4:8], v4, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX11-NEXT: global_store_b32 v[2:3], v8, off ; GFX11-NEXT: s_endpgm @@ -754,6 +773,7 @@ define amdgpu_ps void @struct_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr a ; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 ; GFX12-NEXT: buffer_load_b128 v[4:8], v4, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX12-NEXT: global_store_b32 v[2:3], v8, off ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll index 947c838740d43..ab85230b9f861 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll @@ -10,6 +10,7 @@ define amdgpu_ps <3 x float> @gather_sample(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-LABEL: gather_sample: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: image_sample_lz v2, [v4, v4], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -48,6 +49,7 @@ define amdgpu_ps <3 x float> @sample_gather(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-LABEL: sample_gather: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: image_sample_lz v2, [v4, v4], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll index 3874a456590dc..46d662277928c 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll @@ -77,10 +77,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[32:39] ; W32-NEXT: v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16 ; W32-NEXT: global_store_b128 v[40:41], v[44:47], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 ; W32-NEXT: global_store_b128 v[42:43], v[32:35], off ; W32-NEXT: s_endpgm @@ -102,10 +101,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h ; W32-NEXT: v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39] ; W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; W32-NEXT: v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[44:51] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16 ; W32-NEXT: global_store_b128 v[40:41], v[44:47], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 ; W32-NEXT: global_store_b128 v[42:43], v[32:35], off ; W32-NEXT: s_endpgm @@ -152,10 +150,9 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[32:39] ; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16 ; W32-NEXT: global_store_b128 v[40:41], v[44:47], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 ; W32-NEXT: global_store_b128 v[42:43], v[32:35], off ; W32-NEXT: s_endpgm @@ -177,10 +174,9 @@ define amdgpu_ps void 
@test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x ; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39] ; W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[44:51] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16 ; W32-NEXT: global_store_b128 v[40:41], v[44:47], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 ; W32-NEXT: global_store_b128 v[42:43], v[32:35], off ; W32-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll index 25adc25d71768..377be223dc442 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll @@ -69,6 +69,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[32:35] ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[36:37], v[40:43], off ; W64-NEXT: global_store_b128 v[38:39], v[32:35], off ; W64-NEXT: s_endpgm @@ -90,6 +91,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x h ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35] ; W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[40:43] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[36:37], v[40:43], off ; W64-NEXT: global_store_b128 v[38:39], v[32:35], off ; W64-NEXT: s_endpgm @@ -132,6 +134,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[32:35] ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35] +; 
W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[36:37], v[40:43], off ; W64-NEXT: global_store_b128 v[38:39], v[32:35], off ; W64-NEXT: s_endpgm @@ -153,6 +156,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35] ; W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[40:43] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[36:37], v[40:43], off ; W64-NEXT: global_store_b128 v[38:39], v[32:35], off ; W64-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll index 544941b7fa0da..753146d67cea5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -409,32 +409,32 @@ define amdgpu_kernel void @fma_v2f16( ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s18, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 ; VI-NEXT: s_mov_b32 s16, s4 ; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: s_mov_b32 s12, s2 -; VI-NEXT: s_mov_b32 s13, s3 -; VI-NEXT: s_mov_b32 s18, s10 -; VI-NEXT: s_mov_b32 s19, s11 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: 
v_lshrrev_b32_e32 v3, 16, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; VI-NEXT: v_fma_f16 v3, v5, v4, v3 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_fma_f16 v0, v2, v1, v0 +; VI-NEXT: v_fma_f16 v0, v2, v0, v1 ; VI-NEXT: v_or_b32_e32 v0, v0, v3 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm @@ -485,15 +485,15 @@ define amdgpu_kernel void @fma_v2f16_imm_a( ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s2, 0x40400000 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 @@ -505,9 +505,9 @@ define amdgpu_kernel void @fma_v2f16_imm_a( ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_fma_f32 v2, v3, s2, v2 +; SI-NEXT: v_fma_f32 v2, v2, s2, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, v1, s2, v0 +; SI-NEXT: v_fma_f32 v0, v0, s2, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -582,15 +582,15 @@ define amdgpu_kernel void @fma_v2f16_imm_b( ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_mov_b32 s15, s7 -; 
SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s2, 0x40400000 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 @@ -602,9 +602,9 @@ define amdgpu_kernel void @fma_v2f16_imm_b( ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_fma_f32 v2, v3, s2, v2 +; SI-NEXT: v_fma_f32 v2, v2, s2, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, v1, s2, v0 +; SI-NEXT: v_fma_f32 v0, v0, s2, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -679,15 +679,15 @@ define amdgpu_kernel void @fma_v2f16_imm_c( ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s2, 0x40400000 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 @@ -699,9 +699,9 @@ define amdgpu_kernel void @fma_v2f16_imm_c( ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_fma_f32 v2, v3, v2, s2 +; SI-NEXT: v_fma_f32 v2, v2, v3, s2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, v1, v0, s2 +; SI-NEXT: v_fma_f32 v0, v0, v1, s2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -835,34 +835,34 @@ define amdgpu_kernel void @fma_v4f16( ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b32 s16, s8 ; VI-NEXT: s_mov_b32 s17, s9 +; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s15, s3 ; VI-NEXT: s_mov_b32 s8, s10 ; VI-NEXT: s_mov_b32 s9, s11 ; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s18, s2 -; VI-NEXT: s_mov_b32 s19, s3 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0 +; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 ; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[12:15], 0 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; VI-NEXT: v_fma_f16 v1, v5, v3, v1 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_fma_f16 v1, v5, v1, v3 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; VI-NEXT: v_fma_f16 v0, v4, v2, v0 +; VI-NEXT: v_fma_f16 v0, v4, v0, v2 ; VI-NEXT: v_fma_f16 v2, v8, v7, v6 ; VI-NEXT: v_fma_f16 v3, v9, v5, v3 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index 61991c8b409dd..f2d708a4696b1 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -118,6 +118,7 @@ define amdgpu_kernel void @fmuladd_f16( ; GFX10-FLUSH-NEXT: s_mov_b32 s5, s11 ; GFX10-FLUSH-NEXT: s_mov_b32 s16, s12 ; GFX10-FLUSH-NEXT: s_mov_b32 s17, s13 +; GFX10-FLUSH-NEXT: s_clause 0x1 ; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[16:19], 0 ; GFX10-FLUSH-NEXT: s_mov_b32 s12, s14 @@ -152,6 +153,7 @@ define amdgpu_kernel void @fmuladd_f16( ; GFX10-DENORM-NEXT: s_mov_b32 s17, s13 ; GFX10-DENORM-NEXT: s_mov_b32 s20, s14 ; GFX10-DENORM-NEXT: s_mov_b32 s21, s15 +; GFX10-DENORM-NEXT: s_clause 0x2 ; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[16:19], 0 ; GFX10-DENORM-NEXT: buffer_load_ushort v2, off, s[20:23], 0 @@ -176,6 +178,7 @@ define amdgpu_kernel void @fmuladd_f16( ; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s13, s3 ; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s16, s4 ; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s17, s5 +; GFX11-FLUSH-TRUE16-NEXT: s_clause 0x1 ; GFX11-FLUSH-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 ; GFX11-FLUSH-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 ; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s12, s6 @@ -206,6 +209,7 @@ define amdgpu_kernel void @fmuladd_f16( ; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s13, s3 ; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s16, s4 ; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s17, s5 +; GFX11-FLUSH-FAKE16-NEXT: s_clause 0x1 ; GFX11-FLUSH-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 ; GFX11-FLUSH-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 ; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s4, s6 @@ -241,6 +245,7 @@ define amdgpu_kernel void @fmuladd_f16( ; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s17, s5 ; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s20, s6 ; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s21, s7 +; GFX11-DENORM-TRUE16-NEXT: s_clause 0x2 ; GFX11-DENORM-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 ; 
GFX11-DENORM-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 ; GFX11-DENORM-TRUE16-NEXT: buffer_load_u16 v2, off, s[20:23], 0 @@ -269,6 +274,7 @@ define amdgpu_kernel void @fmuladd_f16( ; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s17, s5 ; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s20, s6 ; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s21, s7 +; GFX11-DENORM-FAKE16-NEXT: s_clause 0x2 ; GFX11-DENORM-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 ; GFX11-DENORM-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 ; GFX11-DENORM-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], 0 @@ -807,28 +813,28 @@ define amdgpu_kernel void @fmuladd_v2f16( ; VI-FLUSH-NEXT: s_mov_b32 s14, s10 ; VI-FLUSH-NEXT: s_mov_b32 s15, s11 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: s_mov_b32 s12, s2 -; VI-FLUSH-NEXT: s_mov_b32 s13, s3 ; VI-FLUSH-NEXT: s_mov_b32 s16, s4 ; VI-FLUSH-NEXT: s_mov_b32 s17, s5 -; VI-FLUSH-NEXT: s_mov_b32 s18, s10 -; VI-FLUSH-NEXT: s_mov_b32 s19, s11 ; VI-FLUSH-NEXT: s_mov_b32 s4, s6 ; VI-FLUSH-NEXT: s_mov_b32 s5, s7 ; VI-FLUSH-NEXT: s_mov_b32 s6, s10 ; VI-FLUSH-NEXT: s_mov_b32 s7, s11 -; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; VI-FLUSH-NEXT: s_mov_b32 s12, s2 +; VI-FLUSH-NEXT: s_mov_b32 s13, s3 +; VI-FLUSH-NEXT: s_mov_b32 s18, s10 +; VI-FLUSH-NEXT: s_mov_b32 s19, s11 +; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; VI-FLUSH-NEXT: buffer_load_dword v2, off, s[16:19], 0 ; VI-FLUSH-NEXT: s_mov_b32 s8, s0 ; VI-FLUSH-NEXT: s_mov_b32 s9, s1 -; VI-FLUSH-NEXT: s_waitcnt vmcnt(1) -; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(2) +; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: v_mac_f16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-FLUSH-NEXT: v_mac_f16_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:WORD_1 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-FLUSH-NEXT: v_mac_f16_e32 v1, v0, v2 -; VI-FLUSH-NEXT: v_or_b32_e32 v0, v1, v3 +; VI-FLUSH-NEXT: v_mac_f16_e32 v0, v1, v2 +; VI-FLUSH-NEXT: v_or_b32_e32 v0, v0, v3 ; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-FLUSH-NEXT: s_endpgm ; @@ -838,32 +844,32 @@ define amdgpu_kernel void @fmuladd_v2f16( ; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 ; VI-DENORM-NEXT: s_mov_b32 s10, -1 ; VI-DENORM-NEXT: s_mov_b32 s14, s10 -; VI-DENORM-NEXT: s_mov_b32 s15, s11 +; VI-DENORM-NEXT: s_mov_b32 s18, s10 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-NEXT: s_mov_b32 s12, s2 +; VI-DENORM-NEXT: s_mov_b32 s13, s3 ; VI-DENORM-NEXT: s_mov_b32 s16, s4 ; VI-DENORM-NEXT: s_mov_b32 s17, s5 +; VI-DENORM-NEXT: s_mov_b32 s19, s11 +; VI-DENORM-NEXT: s_mov_b32 s15, s11 ; VI-DENORM-NEXT: s_mov_b32 s4, s6 ; VI-DENORM-NEXT: s_mov_b32 s5, s7 ; VI-DENORM-NEXT: s_mov_b32 s6, s10 ; VI-DENORM-NEXT: s_mov_b32 s7, s11 -; VI-DENORM-NEXT: s_mov_b32 s12, s2 -; VI-DENORM-NEXT: s_mov_b32 s13, s3 -; VI-DENORM-NEXT: s_mov_b32 s18, s10 -; VI-DENORM-NEXT: s_mov_b32 s19, s11 -; VI-DENORM-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; VI-DENORM-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-DENORM-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-DENORM-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; VI-DENORM-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; VI-DENORM-NEXT: s_mov_b32 s8, s0 ; VI-DENORM-NEXT: s_mov_b32 s9, s1 ; VI-DENORM-NEXT: s_waitcnt vmcnt(2) -; VI-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-DENORM-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-DENORM-NEXT: s_waitcnt vmcnt(1) -; VI-DENORM-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) ; VI-DENORM-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; VI-DENORM-NEXT: v_fma_f16 v3, v5, v4, v3 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-DENORM-NEXT: v_fma_f16 v0, v2, v1, v0 +; VI-DENORM-NEXT: v_fma_f16 v0, v2, 
v0, v1 ; VI-DENORM-NEXT: v_or_b32_e32 v0, v0, v3 ; VI-DENORM-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-DENORM-NEXT: s_endpgm @@ -882,6 +888,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; GFX10-FLUSH-NEXT: s_mov_b32 s5, s11 ; GFX10-FLUSH-NEXT: s_mov_b32 s16, s12 ; GFX10-FLUSH-NEXT: s_mov_b32 s17, s13 +; GFX10-FLUSH-NEXT: s_clause 0x1 ; GFX10-FLUSH-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; GFX10-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; GFX10-FLUSH-NEXT: s_mov_b32 s12, s14 @@ -916,6 +923,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; GFX10-DENORM-NEXT: s_mov_b32 s17, s13 ; GFX10-DENORM-NEXT: s_mov_b32 s20, s14 ; GFX10-DENORM-NEXT: s_mov_b32 s21, s15 +; GFX10-DENORM-NEXT: s_clause 0x2 ; GFX10-DENORM-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; GFX10-DENORM-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; GFX10-DENORM-NEXT: buffer_load_dword v2, off, s[20:23], 0 @@ -940,6 +948,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3 ; GFX11-FLUSH-NEXT: s_mov_b32 s16, s4 ; GFX11-FLUSH-NEXT: s_mov_b32 s17, s5 +; GFX11-FLUSH-NEXT: s_clause 0x1 ; GFX11-FLUSH-NEXT: buffer_load_b32 v0, off, s[12:15], 0 ; GFX11-FLUSH-NEXT: buffer_load_b32 v1, off, s[16:19], 0 ; GFX11-FLUSH-NEXT: s_mov_b32 s4, s6 @@ -975,6 +984,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; GFX11-DENORM-NEXT: s_mov_b32 s17, s5 ; GFX11-DENORM-NEXT: s_mov_b32 s20, s6 ; GFX11-DENORM-NEXT: s_mov_b32 s21, s7 +; GFX11-DENORM-NEXT: s_clause 0x2 ; GFX11-DENORM-NEXT: buffer_load_b32 v0, off, s[12:15], 0 ; GFX11-DENORM-NEXT: buffer_load_b32 v1, off, s[16:19], 0 ; GFX11-DENORM-NEXT: buffer_load_b32 v2, off, s[20:23], 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll index 3344c73f9eb6f..f3d225204b0ea 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll @@ -2399,7 +2399,6 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; 
GFX950-NEXT: scratch_load_dword v50, off, s32 offset:84 ; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:96 ; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:92 -; GFX950-NEXT: scratch_load_dword v31, off, s32 ; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:104 ; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:100 ; GFX950-NEXT: v_accvgpr_write_b32 a11, v58 ; Reload Reuse @@ -2408,21 +2407,22 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: v_accvgpr_write_b32 a14, v61 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a15, v62 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a16, v63 ; Reload Reuse -; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: s_waitcnt vmcnt(24) ; GFX950-NEXT: v_max_f64 v[58:59], v[2:3], v[36:37] ; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[36:37] ; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:112 ; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:108 -; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: s_waitcnt vmcnt(24) ; GFX950-NEXT: v_max_f64 v[60:61], v[4:5], v[38:39] ; GFX950-NEXT: v_cmp_u_f64_e64 s[0:1], v[4:5], v[38:39] ; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:120 ; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:116 -; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: s_waitcnt vmcnt(24) ; GFX950-NEXT: v_max_f64 v[62:63], v[6:7], v[48:49] ; GFX950-NEXT: v_cmp_u_f64_e64 s[2:3], v[6:7], v[48:49] ; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:128 ; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:124 +; GFX950-NEXT: scratch_load_dword v31, off, s32 ; GFX950-NEXT: s_waitcnt vmcnt(25) ; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[56:57] ; GFX950-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[56:57] @@ -2477,7 +2477,7 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: v_cndmask_b32_e64 v20, v52, 0, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v21, v53, v0, vcc ; GFX950-NEXT: v_cmp_u_f64_e32 
vcc, v[22:23], v[34:35] -; GFX950-NEXT: s_waitcnt vmcnt(6) +; GFX950-NEXT: s_waitcnt vmcnt(7) ; GFX950-NEXT: v_max_f64 v[34:35], v[24:25], v[32:33] ; GFX950-NEXT: v_accvgpr_read_b32 v60, a13 ; Reload Reuse ; GFX950-NEXT: v_cndmask_b32_e64 v22, v50, 0, vcc @@ -2497,13 +2497,13 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: v_accvgpr_read_b32 v42, a3 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v41, a2 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v40, a1 ; Reload Reuse -; GFX950-NEXT: s_waitcnt vmcnt(4) +; GFX950-NEXT: s_waitcnt vmcnt(5) ; GFX950-NEXT: v_max_f64 v[32:33], v[26:27], v[36:37] ; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[36:37] ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v26, v32, 0, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v27, v33, v0, vcc -; GFX950-NEXT: s_waitcnt vmcnt(2) +; GFX950-NEXT: s_waitcnt vmcnt(3) ; GFX950-NEXT: v_max_f64 v[32:33], v[28:29], v[38:39] ; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[28:29], v[38:39] ; GFX950-NEXT: s_nop 1 @@ -2642,7 +2642,6 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16 @@ -2675,49 +2674,50 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:124 -; GFX11-NEXT: s_waitcnt vmcnt(30) +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_waitcnt vmcnt(31) ; GFX11-NEXT: v_max_f64 v[96:97], v[0:1], v[32:33] ; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[32:33] -; GFX11-NEXT: s_waitcnt vmcnt(28) +; GFX11-NEXT: 
s_waitcnt vmcnt(29) ; GFX11-NEXT: v_max_f64 v[32:33], v[2:3], v[34:35] ; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[34:35] -; GFX11-NEXT: s_waitcnt vmcnt(26) +; GFX11-NEXT: s_waitcnt vmcnt(27) ; GFX11-NEXT: v_max_f64 v[34:35], v[4:5], v[36:37] ; GFX11-NEXT: v_cmp_u_f64_e64 s1, v[4:5], v[36:37] -; GFX11-NEXT: s_waitcnt vmcnt(24) +; GFX11-NEXT: s_waitcnt vmcnt(25) ; GFX11-NEXT: v_max_f64 v[36:37], v[6:7], v[38:39] ; GFX11-NEXT: v_cmp_u_f64_e64 s2, v[6:7], v[38:39] -; GFX11-NEXT: s_waitcnt vmcnt(22) +; GFX11-NEXT: s_waitcnt vmcnt(23) ; GFX11-NEXT: v_max_f64 v[38:39], v[8:9], v[48:49] ; GFX11-NEXT: v_cmp_u_f64_e64 s3, v[8:9], v[48:49] -; GFX11-NEXT: s_waitcnt vmcnt(20) +; GFX11-NEXT: s_waitcnt vmcnt(21) ; GFX11-NEXT: v_max_f64 v[48:49], v[10:11], v[50:51] ; GFX11-NEXT: v_cmp_u_f64_e64 s4, v[10:11], v[50:51] -; GFX11-NEXT: s_waitcnt vmcnt(18) +; GFX11-NEXT: s_waitcnt vmcnt(19) ; GFX11-NEXT: v_max_f64 v[50:51], v[12:13], v[52:53] ; GFX11-NEXT: v_cmp_u_f64_e64 s5, v[12:13], v[52:53] -; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: s_waitcnt vmcnt(17) ; GFX11-NEXT: v_max_f64 v[52:53], v[14:15], v[54:55] ; GFX11-NEXT: v_cmp_u_f64_e64 s6, v[14:15], v[54:55] -; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: s_waitcnt vmcnt(15) ; GFX11-NEXT: v_max_f64 v[54:55], v[16:17], v[64:65] ; GFX11-NEXT: v_cmp_u_f64_e64 s7, v[16:17], v[64:65] -; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: s_waitcnt vmcnt(13) ; GFX11-NEXT: v_max_f64 v[64:65], v[18:19], v[66:67] ; GFX11-NEXT: v_cmp_u_f64_e64 s8, v[18:19], v[66:67] -; GFX11-NEXT: s_waitcnt vmcnt(10) +; GFX11-NEXT: s_waitcnt vmcnt(11) ; GFX11-NEXT: v_max_f64 v[66:67], v[20:21], v[68:69] ; GFX11-NEXT: v_cmp_u_f64_e64 s9, v[20:21], v[68:69] -; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: s_waitcnt vmcnt(9) ; GFX11-NEXT: v_max_f64 v[68:69], v[22:23], v[70:71] ; GFX11-NEXT: v_cmp_u_f64_e64 s10, v[22:23], v[70:71] -; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: s_waitcnt vmcnt(7) ; GFX11-NEXT: v_max_f64 v[70:71], v[24:25], v[80:81] ; 
GFX11-NEXT: v_cmp_u_f64_e64 s11, v[24:25], v[80:81] -; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: s_waitcnt vmcnt(5) ; GFX11-NEXT: v_max_f64 v[80:81], v[26:27], v[82:83] ; GFX11-NEXT: v_cmp_u_f64_e64 s12, v[26:27], v[82:83] -; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_max_f64 v[82:83], v[28:29], v[84:85] ; GFX11-NEXT: v_cmp_u_f64_e64 s13, v[28:29], v[84:85] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2765,7 +2765,6 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1f -; GFX12-NEXT: scratch_load_b32 v31, off, s32 ; GFX12-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX12-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX12-NEXT: scratch_load_b32 v35, off, s32 offset:16 @@ -2798,35 +2797,36 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX12-NEXT: scratch_load_b32 v84, off, s32 offset:116 ; GFX12-NEXT: scratch_load_b32 v87, off, s32 offset:128 ; GFX12-NEXT: scratch_load_b32 v86, off, s32 offset:124 -; GFX12-NEXT: s_wait_loadcnt 0x1e +; GFX12-NEXT: scratch_load_b32 v31, off, s32 +; GFX12-NEXT: s_wait_loadcnt 0x1f ; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[32:33] -; GFX12-NEXT: s_wait_loadcnt 0x1c +; GFX12-NEXT: s_wait_loadcnt 0x1d ; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[34:35] -; GFX12-NEXT: s_wait_loadcnt 0x1a +; GFX12-NEXT: s_wait_loadcnt 0x1b ; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[36:37] -; GFX12-NEXT: s_wait_loadcnt 0x18 +; GFX12-NEXT: s_wait_loadcnt 0x19 ; GFX12-NEXT: v_maximum_f64 v[6:7], v[6:7], v[38:39] -; GFX12-NEXT: s_wait_loadcnt 0x16 +; GFX12-NEXT: s_wait_loadcnt 0x17 ; GFX12-NEXT: v_maximum_f64 v[8:9], v[8:9], v[48:49] -; GFX12-NEXT: s_wait_loadcnt 0x14 +; GFX12-NEXT: s_wait_loadcnt 0x15 ; GFX12-NEXT: v_maximum_f64 v[10:11], v[10:11], v[50:51] -; GFX12-NEXT: s_wait_loadcnt 0x12 +; GFX12-NEXT: s_wait_loadcnt 0x13 ; GFX12-NEXT: v_maximum_f64 
v[12:13], v[12:13], v[52:53] -; GFX12-NEXT: s_wait_loadcnt 0x10 +; GFX12-NEXT: s_wait_loadcnt 0x11 ; GFX12-NEXT: v_maximum_f64 v[14:15], v[14:15], v[54:55] -; GFX12-NEXT: s_wait_loadcnt 0xe +; GFX12-NEXT: s_wait_loadcnt 0xf ; GFX12-NEXT: v_maximum_f64 v[16:17], v[16:17], v[64:65] -; GFX12-NEXT: s_wait_loadcnt 0xc +; GFX12-NEXT: s_wait_loadcnt 0xd ; GFX12-NEXT: v_maximum_f64 v[18:19], v[18:19], v[66:67] -; GFX12-NEXT: s_wait_loadcnt 0xa +; GFX12-NEXT: s_wait_loadcnt 0xb ; GFX12-NEXT: v_maximum_f64 v[20:21], v[20:21], v[68:69] -; GFX12-NEXT: s_wait_loadcnt 0x8 +; GFX12-NEXT: s_wait_loadcnt 0x9 ; GFX12-NEXT: v_maximum_f64 v[22:23], v[22:23], v[70:71] -; GFX12-NEXT: s_wait_loadcnt 0x6 +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: v_maximum_f64 v[24:25], v[24:25], v[80:81] -; GFX12-NEXT: s_wait_loadcnt 0x4 +; GFX12-NEXT: s_wait_loadcnt 0x5 ; GFX12-NEXT: v_maximum_f64 v[26:27], v[26:27], v[82:83] -; GFX12-NEXT: s_wait_loadcnt 0x2 +; GFX12-NEXT: s_wait_loadcnt 0x3 ; GFX12-NEXT: v_maximum_f64 v[28:29], v[28:29], v[84:85] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_maximum_f64 v[30:31], v[30:31], v[86:87] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index 863240cc591c3..524c4a557cc04 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -458,44 +458,40 @@ define amdgpu_kernel void @maxnum_v2f16( ; ; VI-LABEL: maxnum_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s8, s[8:9], 0x0 -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_load_dword s6, s[2:3], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, 
-1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v1, s2, s2 -; VI-NEXT: s_lshr_b32 s0, s8, 16 +; VI-NEXT: v_max_f16_e64 v1, s6, s6 +; VI-NEXT: v_max_f16_e64 v0, s4, s4 +; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_max_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_max_f16_e64 v1, s4, s4 +; VI-NEXT: s_lshr_b32 s4, s6, 16 +; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 -; GFX9-NEXT: s_load_dword s11, s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 +; GFX9-NEXT: v_pk_max_f16 v0, s5, s5 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v2f16: @@ -504,6 +500,7 @@ define amdgpu_kernel void @maxnum_v2f16( ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: 
s_load_dword s4, s[6:7], 0x0 ; GFX10-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -521,6 +518,7 @@ define amdgpu_kernel void @maxnum_v2f16( ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s4, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -753,52 +751,48 @@ define amdgpu_kernel void @maxnum_v3f16( ; ; VI-LABEL: maxnum_v3f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v1, s2, s2 -; VI-NEXT: s_lshr_b32 s0, s8, 16 +; VI-NEXT: v_max_f16_e64 v1, s6, s6 +; VI-NEXT: v_max_f16_e64 v0, s4, s4 +; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_max_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_max_f16_e64 v1, s4, s4 +; VI-NEXT: s_lshr_b32 s4, s6, 16 +; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_max_f16_e64 v1, s9, s9 -; VI-NEXT: v_max_f16_e64 v2, s3, s3 +; VI-NEXT: v_max_f16_e64 v1, s5, s5 +; VI-NEXT: v_max_f16_e64 v2, s7, s7 ; VI-NEXT: v_max_f16_e32 v1, v2, v1 -; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 -; 
VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_v3f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 -; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 +; GFX9-NEXT: v_pk_max_f16 v0, s8, s8 +; GFX9-NEXT: v_pk_max_f16 v2, s9, s9 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 +; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 -; GFX9-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v3f16: @@ -807,6 +801,7 @@ define amdgpu_kernel void @maxnum_v3f16( ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -828,6 +823,7 @@ define amdgpu_kernel void @maxnum_v3f16( ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX11-NEXT: 
s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -897,56 +893,52 @@ define amdgpu_kernel void @maxnum_v4f16( ; ; VI-LABEL: maxnum_v4f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s9, s9 -; VI-NEXT: v_max_f16_e64 v1, s3, s3 -; VI-NEXT: s_lshr_b32 s0, s9, 16 +; VI-NEXT: v_max_f16_e64 v1, s7, s7 +; VI-NEXT: v_max_f16_e64 v0, s5, s5 +; VI-NEXT: s_lshr_b32 s5, s5, 16 ; VI-NEXT: v_max_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_max_f16_e64 v1, s5, s5 +; VI-NEXT: s_lshr_b32 s5, s7, 16 +; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v2, s2, s2 -; VI-NEXT: s_lshr_b32 s0, s8, 16 +; VI-NEXT: v_max_f16_e64 v0, s4, s4 +; VI-NEXT: v_max_f16_e64 v2, s6, s6 +; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_max_f16_e32 v0, v2, v0 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v3, s0, s0 +; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: s_lshr_b32 s4, s6, 16 +; VI-NEXT: v_max_f16_e64 v3, s4, s4 ; VI-NEXT: 
v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 -; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 +; GFX9-NEXT: v_pk_max_f16 v0, s9, s9 +; GFX9-NEXT: v_pk_max_f16 v2, s8, s8 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 +; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v4f16: @@ -955,6 +947,7 @@ define amdgpu_kernel void @maxnum_v4f16( ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -975,6 +968,7 @@ define amdgpu_kernel void @maxnum_v4f16( ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll index 1d1673315f6ff..73059dbb3f752 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll @@ -2399,7 +2399,6 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:84 ; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:96 ; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:92 -; GFX950-NEXT: scratch_load_dword v31, off, s32 ; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:104 ; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:100 ; GFX950-NEXT: v_accvgpr_write_b32 a11, v58 ; Reload Reuse @@ -2408,21 +2407,22 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: v_accvgpr_write_b32 a14, v61 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a15, v62 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a16, v63 ; Reload Reuse -; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: s_waitcnt vmcnt(24) ; GFX950-NEXT: v_min_f64 v[58:59], v[2:3], v[36:37] ; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[36:37] ; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:112 ; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:108 -; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: s_waitcnt vmcnt(24) ; GFX950-NEXT: v_min_f64 v[60:61], v[4:5], v[38:39] ; GFX950-NEXT: v_cmp_u_f64_e64 s[0:1], v[4:5], v[38:39] ; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:120 ; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:116 -; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: s_waitcnt vmcnt(24) ; GFX950-NEXT: v_min_f64 v[62:63], v[6:7], v[48:49] ; GFX950-NEXT: v_cmp_u_f64_e64 s[2:3], v[6:7], v[48:49] ; 
GFX950-NEXT: scratch_load_dword v49, off, s32 offset:128 ; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:124 +; GFX950-NEXT: scratch_load_dword v31, off, s32 ; GFX950-NEXT: s_waitcnt vmcnt(25) ; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[56:57] ; GFX950-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[56:57] @@ -2477,7 +2477,7 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: v_cndmask_b32_e64 v20, v52, 0, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v21, v53, v0, vcc ; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[22:23], v[34:35] -; GFX950-NEXT: s_waitcnt vmcnt(6) +; GFX950-NEXT: s_waitcnt vmcnt(7) ; GFX950-NEXT: v_min_f64 v[34:35], v[24:25], v[32:33] ; GFX950-NEXT: v_accvgpr_read_b32 v60, a13 ; Reload Reuse ; GFX950-NEXT: v_cndmask_b32_e64 v22, v50, 0, vcc @@ -2497,13 +2497,13 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX950-NEXT: v_accvgpr_read_b32 v42, a3 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v41, a2 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v40, a1 ; Reload Reuse -; GFX950-NEXT: s_waitcnt vmcnt(4) +; GFX950-NEXT: s_waitcnt vmcnt(5) ; GFX950-NEXT: v_min_f64 v[32:33], v[26:27], v[36:37] ; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[36:37] ; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e64 v26, v32, 0, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v27, v33, v0, vcc -; GFX950-NEXT: s_waitcnt vmcnt(2) +; GFX950-NEXT: s_waitcnt vmcnt(3) ; GFX950-NEXT: v_min_f64 v[32:33], v[28:29], v[38:39] ; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[28:29], v[38:39] ; GFX950-NEXT: s_nop 1 @@ -2642,7 +2642,6 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16 @@ 
-2675,49 +2674,50 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:124 -; GFX11-NEXT: s_waitcnt vmcnt(30) +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_waitcnt vmcnt(31) ; GFX11-NEXT: v_min_f64 v[96:97], v[0:1], v[32:33] ; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[32:33] -; GFX11-NEXT: s_waitcnt vmcnt(28) +; GFX11-NEXT: s_waitcnt vmcnt(29) ; GFX11-NEXT: v_min_f64 v[32:33], v[2:3], v[34:35] ; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[34:35] -; GFX11-NEXT: s_waitcnt vmcnt(26) +; GFX11-NEXT: s_waitcnt vmcnt(27) ; GFX11-NEXT: v_min_f64 v[34:35], v[4:5], v[36:37] ; GFX11-NEXT: v_cmp_u_f64_e64 s1, v[4:5], v[36:37] -; GFX11-NEXT: s_waitcnt vmcnt(24) +; GFX11-NEXT: s_waitcnt vmcnt(25) ; GFX11-NEXT: v_min_f64 v[36:37], v[6:7], v[38:39] ; GFX11-NEXT: v_cmp_u_f64_e64 s2, v[6:7], v[38:39] -; GFX11-NEXT: s_waitcnt vmcnt(22) +; GFX11-NEXT: s_waitcnt vmcnt(23) ; GFX11-NEXT: v_min_f64 v[38:39], v[8:9], v[48:49] ; GFX11-NEXT: v_cmp_u_f64_e64 s3, v[8:9], v[48:49] -; GFX11-NEXT: s_waitcnt vmcnt(20) +; GFX11-NEXT: s_waitcnt vmcnt(21) ; GFX11-NEXT: v_min_f64 v[48:49], v[10:11], v[50:51] ; GFX11-NEXT: v_cmp_u_f64_e64 s4, v[10:11], v[50:51] -; GFX11-NEXT: s_waitcnt vmcnt(18) +; GFX11-NEXT: s_waitcnt vmcnt(19) ; GFX11-NEXT: v_min_f64 v[50:51], v[12:13], v[52:53] ; GFX11-NEXT: v_cmp_u_f64_e64 s5, v[12:13], v[52:53] -; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: s_waitcnt vmcnt(17) ; GFX11-NEXT: v_min_f64 v[52:53], v[14:15], v[54:55] ; GFX11-NEXT: v_cmp_u_f64_e64 s6, v[14:15], v[54:55] -; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: s_waitcnt vmcnt(15) ; GFX11-NEXT: v_min_f64 v[54:55], v[16:17], v[64:65] ; GFX11-NEXT: v_cmp_u_f64_e64 s7, v[16:17], v[64:65] -; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: s_waitcnt vmcnt(13) ; GFX11-NEXT: v_min_f64 v[64:65], v[18:19], 
v[66:67] ; GFX11-NEXT: v_cmp_u_f64_e64 s8, v[18:19], v[66:67] -; GFX11-NEXT: s_waitcnt vmcnt(10) +; GFX11-NEXT: s_waitcnt vmcnt(11) ; GFX11-NEXT: v_min_f64 v[66:67], v[20:21], v[68:69] ; GFX11-NEXT: v_cmp_u_f64_e64 s9, v[20:21], v[68:69] -; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: s_waitcnt vmcnt(9) ; GFX11-NEXT: v_min_f64 v[68:69], v[22:23], v[70:71] ; GFX11-NEXT: v_cmp_u_f64_e64 s10, v[22:23], v[70:71] -; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: s_waitcnt vmcnt(7) ; GFX11-NEXT: v_min_f64 v[70:71], v[24:25], v[80:81] ; GFX11-NEXT: v_cmp_u_f64_e64 s11, v[24:25], v[80:81] -; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: s_waitcnt vmcnt(5) ; GFX11-NEXT: v_min_f64 v[80:81], v[26:27], v[82:83] ; GFX11-NEXT: v_cmp_u_f64_e64 s12, v[26:27], v[82:83] -; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_min_f64 v[82:83], v[28:29], v[84:85] ; GFX11-NEXT: v_cmp_u_f64_e64 s13, v[28:29], v[84:85] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2765,7 +2765,6 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1f -; GFX12-NEXT: scratch_load_b32 v31, off, s32 ; GFX12-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX12-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX12-NEXT: scratch_load_b32 v35, off, s32 offset:16 @@ -2798,35 +2797,36 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX12-NEXT: scratch_load_b32 v84, off, s32 offset:116 ; GFX12-NEXT: scratch_load_b32 v87, off, s32 offset:128 ; GFX12-NEXT: scratch_load_b32 v86, off, s32 offset:124 -; GFX12-NEXT: s_wait_loadcnt 0x1e +; GFX12-NEXT: scratch_load_b32 v31, off, s32 +; GFX12-NEXT: s_wait_loadcnt 0x1f ; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[32:33] -; GFX12-NEXT: s_wait_loadcnt 0x1c +; GFX12-NEXT: s_wait_loadcnt 0x1d ; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[34:35] -; GFX12-NEXT: s_wait_loadcnt 0x1a +; GFX12-NEXT: 
s_wait_loadcnt 0x1b ; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[36:37] -; GFX12-NEXT: s_wait_loadcnt 0x18 +; GFX12-NEXT: s_wait_loadcnt 0x19 ; GFX12-NEXT: v_minimum_f64 v[6:7], v[6:7], v[38:39] -; GFX12-NEXT: s_wait_loadcnt 0x16 +; GFX12-NEXT: s_wait_loadcnt 0x17 ; GFX12-NEXT: v_minimum_f64 v[8:9], v[8:9], v[48:49] -; GFX12-NEXT: s_wait_loadcnt 0x14 +; GFX12-NEXT: s_wait_loadcnt 0x15 ; GFX12-NEXT: v_minimum_f64 v[10:11], v[10:11], v[50:51] -; GFX12-NEXT: s_wait_loadcnt 0x12 +; GFX12-NEXT: s_wait_loadcnt 0x13 ; GFX12-NEXT: v_minimum_f64 v[12:13], v[12:13], v[52:53] -; GFX12-NEXT: s_wait_loadcnt 0x10 +; GFX12-NEXT: s_wait_loadcnt 0x11 ; GFX12-NEXT: v_minimum_f64 v[14:15], v[14:15], v[54:55] -; GFX12-NEXT: s_wait_loadcnt 0xe +; GFX12-NEXT: s_wait_loadcnt 0xf ; GFX12-NEXT: v_minimum_f64 v[16:17], v[16:17], v[64:65] -; GFX12-NEXT: s_wait_loadcnt 0xc +; GFX12-NEXT: s_wait_loadcnt 0xd ; GFX12-NEXT: v_minimum_f64 v[18:19], v[18:19], v[66:67] -; GFX12-NEXT: s_wait_loadcnt 0xa +; GFX12-NEXT: s_wait_loadcnt 0xb ; GFX12-NEXT: v_minimum_f64 v[20:21], v[20:21], v[68:69] -; GFX12-NEXT: s_wait_loadcnt 0x8 +; GFX12-NEXT: s_wait_loadcnt 0x9 ; GFX12-NEXT: v_minimum_f64 v[22:23], v[22:23], v[70:71] -; GFX12-NEXT: s_wait_loadcnt 0x6 +; GFX12-NEXT: s_wait_loadcnt 0x7 ; GFX12-NEXT: v_minimum_f64 v[24:25], v[24:25], v[80:81] -; GFX12-NEXT: s_wait_loadcnt 0x4 +; GFX12-NEXT: s_wait_loadcnt 0x5 ; GFX12-NEXT: v_minimum_f64 v[26:27], v[26:27], v[82:83] -; GFX12-NEXT: s_wait_loadcnt 0x2 +; GFX12-NEXT: s_wait_loadcnt 0x3 ; GFX12-NEXT: v_minimum_f64 v[28:29], v[28:29], v[84:85] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_minimum_f64 v[30:31], v[30:31], v[86:87] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index 7e8c30161c1c8..ba80a37c1a9d8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -490,44 +490,40 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; ; VI-LABEL: 
minnum_v2f16_ieee: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s8, s[8:9], 0x0 -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_load_dword s6, s[2:3], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v1, s2, s2 -; VI-NEXT: s_lshr_b32 s0, s8, 16 +; VI-NEXT: v_max_f16_e64 v1, s6, s6 +; VI-NEXT: v_max_f16_e64 v0, s4, s4 +; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_min_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_max_f16_e64 v1, s4, s4 +; VI-NEXT: s_lshr_b32 s4, s6, 16 +; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_v2f16_ieee: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 -; GFX9-NEXT: s_load_dword s11, s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, 
s10, s10 -; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 +; GFX9-NEXT: v_pk_max_f16 v0, s5, s5 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v2f16_ieee: @@ -536,6 +532,7 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -553,6 +550,7 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s4, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -817,52 +815,48 @@ define amdgpu_kernel void @minnum_v3f16( ; ; VI-LABEL: minnum_v3f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v1, s2, s2 -; VI-NEXT: s_lshr_b32 s0, s8, 16 +; VI-NEXT: v_max_f16_e64 v1, s6, s6 +; VI-NEXT: v_max_f16_e64 v0, s4, s4 +; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_min_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, 
s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_max_f16_e64 v1, s4, s4 +; VI-NEXT: s_lshr_b32 s4, s6, 16 +; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_max_f16_e64 v1, s9, s9 -; VI-NEXT: v_max_f16_e64 v2, s3, s3 +; VI-NEXT: v_max_f16_e64 v1, s5, s5 +; VI-NEXT: v_max_f16_e64 v2, s7, s7 ; VI-NEXT: v_min_f16_e32 v1, v2, v1 -; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_v3f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 -; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 +; GFX9-NEXT: v_pk_max_f16 v0, s8, s8 +; GFX9-NEXT: v_pk_max_f16 v2, s9, s9 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 +; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 -; GFX9-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 +; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v3f16: @@ -871,6 +865,7 @@ define amdgpu_kernel void @minnum_v3f16( ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -892,6 +887,7 @@ define amdgpu_kernel void @minnum_v3f16( ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -960,56 +956,52 @@ define amdgpu_kernel void @minnum_v4f16( ; ; VI-LABEL: minnum_v4f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s9, s9 -; VI-NEXT: v_max_f16_e64 v1, s3, s3 -; VI-NEXT: s_lshr_b32 s0, s9, 16 +; VI-NEXT: v_max_f16_e64 v1, s7, s7 +; VI-NEXT: v_max_f16_e64 v0, s5, s5 +; VI-NEXT: s_lshr_b32 s5, s5, 16 ; VI-NEXT: v_min_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_max_f16_e64 v1, s5, s5 +; VI-NEXT: s_lshr_b32 s5, s7, 16 +; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v2, s2, s2 -; VI-NEXT: s_lshr_b32 s0, s8, 16 +; VI-NEXT: v_max_f16_e64 v0, s4, s4 +; VI-NEXT: v_max_f16_e64 v2, s6, s6 +; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_min_f16_e32 v0, v2, v0 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v3, s0, s0 +; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: s_lshr_b32 s4, s6, 16 +; VI-NEXT: v_max_f16_e64 v3, s4, s4 ; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 -; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 +; GFX9-NEXT: v_pk_max_f16 v0, s9, s9 +; GFX9-NEXT: v_pk_max_f16 v2, s8, s8 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 +; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, 
s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v4f16: @@ -1018,6 +1010,7 @@ define amdgpu_kernel void @minnum_v4f16( ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -1038,6 +1031,7 @@ define amdgpu_kernel void @minnum_v4f16( ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll index 9e518589ac5b3..0a9a41b2010a6 100644 --- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll @@ -20,12 +20,12 @@ define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], ptr %p ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_add_u32 s0, s0, 4 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_load_dword v0, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: flat_load_dword v1, v[1:2] +; GCN-NEXT: flat_load_dword v1, v[2:3] ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll index 1857eaba0a2a9..f4b03512c8915 100644 --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -114,13 +114,13 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) 
%out, ptr addr ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v2 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v4, v[4:5] -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v7, v[2:3] -; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v5, v[2:3] ; VI-NEXT: flat_load_ushort v8, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 @@ -128,11 +128,11 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_max_i16_e32 v6, v5, v7 -; VI-NEXT: v_max_i16_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_max_i16_e32 v6, v7, v5 +; VI-NEXT: v_max_i16_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_i16_e32 v4, v4, v8 +; VI-NEXT: v_max_i16_e32 v4, v8, v4 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 ; VI-NEXT: flat_store_short v[2:3], v4 ; VI-NEXT: flat_store_dword v[0:1], v5 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 05ffaf62ff1e0..c68bc647d1f91 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -2613,6 +2613,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX10-NEXT: s_load_dword s9, s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2631,6 +2632,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11-NEXT: s_load_b32 s5, s[6:7], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll index 0f67a404972aa..ab38e91af8bb4 100644 --- a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll +++ b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll @@ -10,11 +10,13 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh ; GFX11-NEXT: s_mov_b32 s0, s3 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: s_load_b256 s[20:27], s[2:3], 0x40 ; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 ; GFX11-NEXT: s_load_b512 s[36:51], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0 ; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0 ; GFX11-NEXT: image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 @@ -48,10 +50,12 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh ; GFX12-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 ; GFX12-NEXT: s_mov_b32 s3, s5 ; GFX12-NEXT: s_mov_b32 s1, s5 +; GFX12-NEXT: s_clause 0x2 ; GFX12-NEXT: s_load_b256 s[20:27], s[2:3], 0x40 ; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 ; GFX12-NEXT: s_load_b512 s[36:51], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_clause 0x4 ; GFX12-NEXT: buffer_load_b32 v1, off, s[20:23], null ; GFX12-NEXT: buffer_load_b32 v2, off, s[16:19], null ; GFX12-NEXT: image_sample_lz v3, 
v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 @@ -85,11 +89,12 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 ; GFX12-GISEL-NEXT: s_mov_b32 s1, s21 ; GFX12-GISEL-NEXT: s_mov_b32 s3, s21 +; GFX12-GISEL-NEXT: s_clause 0x2 ; GFX12-GISEL-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 -; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b256 s[20:27], s[2:3], 0x40 ; GFX12-GISEL-NEXT: s_load_b512 s[36:51], s[2:3], 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_clause 0x4 ; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-GISEL-NEXT: buffer_load_b32 v2, off, s[16:19], null ; GFX12-GISEL-NEXT: buffer_load_b32 v3, off, s[20:23], null diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 0f47a31f52dcb..cf4813772530a 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -500,6 +500,7 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s2 ; GFX10-NEXT: s_mov_b32 s13, s3 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; GFX10-NEXT: s_mov_b32 s4, s0 @@ -523,6 +524,7 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s12, s2 ; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[4:7], 0 ; GFX11-NEXT: s_mov_b32 s8, s0 @@ -546,6 +548,7 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s12, s2 ; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null ; 
GFX12-NEXT: buffer_load_b32 v1, off, s[4:7], null ; GFX12-NEXT: s_mov_b32 s8, s0 @@ -1884,24 +1887,24 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_lo_u32 v1, v2, v1 -; SI-NEXT: v_mul_hi_u32 v4, v2, v0 -; SI-NEXT: v_mul_lo_u32 v3, v3, v0 -; SI-NEXT: v_mul_lo_u32 v0, v2, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; SI-NEXT: v_mul_lo_u32 v3, v0, v3 +; SI-NEXT: v_mul_hi_u32 v4, v0, v2 +; SI-NEXT: v_mul_lo_u32 v1, v1, v2 +; SI-NEXT: v_mul_lo_u32 v0, v0, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, v3, v4 +; SI-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -1911,24 +1914,24 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 -; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_mov_b32 s15, s7 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 
+; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mul_lo_u32 v4, v2, v1 -; VI-NEXT: v_mad_u64_u32 v[1:2], s[2:3], v2, v0, 0 -; VI-NEXT: v_mul_lo_u32 v0, v3, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 +; VI-NEXT: v_mul_lo_u32 v5, v0, v3 +; VI-NEXT: v_mad_u64_u32 v[3:4], s[2:3], v0, v2, 0 +; VI-NEXT: v_mul_lo_u32 v0, v1, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v4, vcc, v1, v0 +; VI-NEXT: buffer_store_dwordx2 v[3:4], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_mul_i64: @@ -1937,24 +1940,24 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_mov_b32 s14, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s2 ; GFX9-NEXT: s_mov_b32 s13, s3 -; GFX9-NEXT: s_mov_b32 s14, s6 ; GFX9-NEXT: s_mov_b32 s15, s7 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, v2, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, v2, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, v3 +; GFX9-NEXT: v_mul_hi_u32 v4, v0, 
v2 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -1965,24 +1968,25 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s10, s6 -; GFX10-NEXT: s_mov_b32 s11, s7 ; GFX10-NEXT: s_mov_b32 s14, s6 ; GFX10-NEXT: s_mov_b32 s15, s7 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s2 ; GFX10-NEXT: s_mov_b32 s13, s3 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX10-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; GFX10-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 ; GFX10-NEXT: s_mov_b32 s4, s0 ; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX10-NEXT: v_mul_hi_u32 v4, v2, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, v3, v0 -; GFX10-NEXT: v_mul_lo_u32 v0, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v4, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-NEXT: v_mul_lo_u32 v3, v0, v3 +; GFX10-NEXT: v_mul_hi_u32 v4, v0, v2 +; GFX10-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v3, v1 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; @@ -1993,25 +1997,26 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_mov_b32 
s14, s10 ; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s12, s2 ; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[4:7], 0 ; GFX11-NEXT: s_mov_b32 s8, s0 ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX11-NEXT: v_mul_hi_u32 v4, v2, v0 -; GFX11-NEXT: v_mul_lo_u32 v3, v3, v0 -; GFX11-NEXT: v_mul_lo_u32 v0, v2, v0 +; GFX11-NEXT: v_mul_lo_u32 v3, v0, v3 +; GFX11-NEXT: v_mul_hi_u32 v4, v0, v2 +; GFX11-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX11-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v4, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v3, v1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 ; GFX11-NEXT: s_endpgm ; @@ -2022,24 +2027,25 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_mov_b32 s10, -1 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000 -; GFX12-NEXT: s_mov_b32 s6, s10 -; GFX12-NEXT: s_mov_b32 s7, s11 ; GFX12-NEXT: s_mov_b32 s14, s10 ; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s6, s10 +; GFX12-NEXT: s_mov_b32 s7, s11 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s12, s2 ; GFX12-NEXT: s_mov_b32 s13, s3 -; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null -; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[12:15], null +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null +; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[4:7], null ; GFX12-NEXT: s_mov_b32 
s8, s0 ; GFX12-NEXT: s_mov_b32 s9, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3 -; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX12-NEXT: v_mul_hi_u32 v4, v0, v2 -; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX12-NEXT: v_mul_lo_u32 v1, v2, v1 +; GFX12-NEXT: v_mul_lo_u32 v3, v3, v0 +; GFX12-NEXT: v_mul_hi_u32 v4, v2, v0 +; GFX12-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_nc_u32_e32 v1, v3, v1 +; GFX12-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX12-NEXT: v_add_nc_u32_e32 v1, v1, v4 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index cc9650b9a7309..a40a766a6169a 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -734,20 +734,20 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s10, s6 -; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: s_mov_b32 s14, s6 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s12, s2 ; GFX6-NEXT: s_mov_b32 s13, s3 -; GFX6-NEXT: s_mov_b32 s14, s6 ; GFX6-NEXT: s_mov_b32 s15, s7 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX6-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; GFX6-NEXT: s_mov_b32 s10, s6 +; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; GFX6-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -757,20 +757,20 @@ define 
amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX8-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: s_mov_b32 s10, s6 -; GFX8-NEXT: s_mov_b32 s11, s7 +; GFX8-NEXT: s_mov_b32 s14, s6 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s12, s2 ; GFX8-NEXT: s_mov_b32 s13, s3 -; GFX8-NEXT: s_mov_b32 s14, s6 ; GFX8-NEXT: s_mov_b32 s15, s7 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; GFX8-NEXT: s_mov_b32 s10, s6 +; GFX8-NEXT: s_mov_b32 s11, s7 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 ; GFX8-NEXT: s_mov_b32 s4, s0 ; GFX8-NEXT: s_mov_b32 s5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index 120aebf2bf7c8..b629551a81bbf 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -33,6 +33,7 @@ define hidden void @shuffle3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt ; GFX10-LABEL: shuffle3744: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -116,6 +117,7 @@ define hidden void @shuffle1004(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt ; GFX10-LABEL: shuffle1004: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt 
vmcnt(0) @@ -147,6 +149,7 @@ define hidden void @shuffle7533(ptr addrspace(0) %in0, ptr addrspace(0) %in1, pt ; GFX10-LABEL: shuffle7533: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: flat_load_dword v6, v[0:1] ; GFX10-NEXT: flat_load_dword v7, v[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -295,6 +298,7 @@ define hidden void @shuffle3546(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt ; GFX10-LABEL: shuffle3546: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -564,6 +568,7 @@ define hidden void @addUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 % ; GFX10-LABEL: addUsesOr: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -613,6 +618,7 @@ define amdgpu_kernel void @shuffle8i8(ptr addrspace(1) %in0, ptr addrspace(1) %i ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -688,6 +694,7 @@ define hidden void @add(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, p ; GFX10-LABEL: add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -744,6 +751,7 @@ define hidden void @add_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %el ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 
v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -794,6 +802,7 @@ define hidden void @add_store(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 % ; GFX10-LABEL: add_store: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v9, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -853,6 +862,7 @@ define hidden void @add_store_div_16(ptr addrspace(1) %in0, ptr addrspace(1) %in ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v9, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -921,6 +931,7 @@ define hidden void @add_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v9, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -991,6 +1002,7 @@ define hidden void @and_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: v_mov_b32_e32 v0, 2 @@ -1129,6 +1141,7 @@ define hidden void @bc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i ; GFX10-NEXT: 
v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v9, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1178,6 +1191,7 @@ define hidden void @eve_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -1231,6 +1245,7 @@ define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v9 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: global_load_dword v10, v[2:3], off ; GFX10-NEXT: v_mov_b32_e32 v0, 16 @@ -1298,6 +1313,7 @@ define hidden void @lhsr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v9, v[2:3], off ; GFX10-NEXT: v_mov_b32_e32 v0, 26 @@ -1367,6 +1383,7 @@ define hidden void @mul_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v9, v[2:3], off ; GFX10-NEXT: 
s_waitcnt vmcnt(1) @@ -1437,6 +1454,7 @@ define hidden void @or_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: v_mov_b32_e32 v0, 16 @@ -1501,33 +1519,34 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: global_load_dword v4, v[2:3], off -; GFX10-NEXT: global_load_dword v9, v[0:1], off +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v9, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v10, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v19, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v15, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v10 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v12 -; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v14 -; GFX10-NEXT: v_xor_b32_sdwa v3, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 -; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 -; GFX10-NEXT: v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 +; GFX10-NEXT: v_xor_b32_sdwa v3, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX10-NEXT: v_xor_b32_sdwa v13, sext(v9), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 ; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GFX10-NEXT: v_mul_f32_e32 v15, v2, v15 ; GFX10-NEXT: v_mul_f32_e32 v16, v19, v16 -; GFX10-NEXT: v_ashrrev_i32_e32 v3, 30, v3 ; GFX10-NEXT: v_mul_f32_e32 v17, v2, v17 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 30, v3 ; GFX10-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_trunc_f32_e32 v15, v15 ; GFX10-NEXT: 
v_trunc_f32_e32 v16, v16 @@ -1562,7 +1581,7 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_add_nc_u32_sdwa v3, v18, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x60706 +; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x60706 ; GFX10-NEXT: global_store_dword v[5:6], v0, off ; GFX10-NEXT: global_store_dword v[7:8], v1, off ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1576,26 +1595,26 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: global_load_dword v9, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off ; GFX9-NEXT: s_mov_b32 s4, 0x60706 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 -; GFX9-NEXT: v_xor_b32_sdwa v1, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX9-NEXT: v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v11, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; 
GFX9-NEXT: v_xor_b32_sdwa v9, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX9-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; GFX9-NEXT: v_perm_b32 v0, v4, v9, s4 +; GFX9-NEXT: v_xor_b32_sdwa v1, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX9-NEXT: v_xor_b32_sdwa v10, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v11, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_xor_b32_sdwa v4, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX9-NEXT: v_xor_b32_sdwa v14, sext(v9), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v9, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v12 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v13 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v9 ; GFX9-NEXT: v_mul_f32_e32 v15, v3, v15 ; GFX9-NEXT: v_mul_f32_e32 v16, v11, v16 ; GFX9-NEXT: v_trunc_f32_e32 v15, v15 @@ -1610,23 +1629,23 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_trunc_f32_e32 v18, v18 ; GFX9-NEXT: v_mad_f32 v11, -v16, v12, v11 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, 
|v19|, |v2| -; GFX9-NEXT: v_ashrrev_i32_e32 v9, 30, v9 +; GFX9-NEXT: v_ashrrev_i32_e32 v4, 30, v4 ; GFX9-NEXT: v_or_b32_e32 v10, 1, v10 ; GFX9-NEXT: v_cvt_i32_f32_e32 v15, v15 ; GFX9-NEXT: v_cvt_i32_f32_e32 v16, v16 ; GFX9-NEXT: v_mad_f32 v3, -v17, v13, v3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v17, v17 -; GFX9-NEXT: v_mad_f32 v2, -v18, v4, v2 +; GFX9-NEXT: v_mad_f32 v2, -v18, v9, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v18, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v11|, |v12| ; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v14 -; GFX9-NEXT: v_or_b32_e32 v9, 1, v9 +; GFX9-NEXT: v_or_b32_e32 v4, 1, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v13| ; GFX9-NEXT: v_or_b32_e32 v14, 1, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v4| +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v9| ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v14, vcc ; GFX9-NEXT: v_add_u32_e32 v1, v15, v1 ; GFX9-NEXT: v_add_u32_sdwa v4, v16, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -1662,6 +1681,7 @@ define hidden void @sext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -1726,6 +1746,7 @@ define hidden void @shl_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v9, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ 
-1795,6 +1816,7 @@ define hidden void @sitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v9, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -1858,6 +1880,7 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -2037,6 +2060,7 @@ define hidden void @sub_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v2, v[2:3], off ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -2106,6 +2130,7 @@ define hidden void @sv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2153,6 +2178,7 @@ define hidden void @trunc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; 
GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v9, v[2:3], off ; GFX10-NEXT: v_mov_b32_e32 v0, 1 @@ -2223,6 +2249,7 @@ define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v2, v[2:3], off ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -2280,45 +2307,45 @@ define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: global_load_dword v9, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off ; GFX9-NEXT: s_mov_b32 s4, 0x40207 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v1, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v11, v2 -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v4 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v1, v9 -; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v10, v4 +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v10, v9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v13, v10 ; GFX9-NEXT: v_mul_f32_e32 v11, v1, v11 -; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 -; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v4, v4 +; GFX9-NEXT: v_perm_b32 v0, v4, v9, s4 +; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v9, v9 ; GFX9-NEXT: v_trunc_f32_e32 v11, v11 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v9 ; GFX9-NEXT: v_mul_f32_e32 v12, v10, v12 ; GFX9-NEXT: v_mad_f32 v1, -v11, v2, v1 
; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v9, v9 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v4 ; GFX9-NEXT: v_trunc_f32_e32 v12, v12 -; GFX9-NEXT: v_mul_f32_e32 v13, v9, v13 +; GFX9-NEXT: v_mul_f32_e32 v13, v4, v13 ; GFX9-NEXT: v_mad_f32 v15, -v12, v3, v10 ; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GFX9-NEXT: v_trunc_f32_e32 v13, v13 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2 ; GFX9-NEXT: v_mul_f32_e32 v14, v2, v14 -; GFX9-NEXT: v_mad_f32 v9, -v13, v10, v9 +; GFX9-NEXT: v_mad_f32 v4, -v13, v10, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v13 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v11, vcc ; GFX9-NEXT: v_trunc_f32_e32 v14, v14 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v15|, v3 -; GFX9-NEXT: v_mad_f32 v16, -v14, v4, v2 +; GFX9-NEXT: v_mad_f32 v16, -v14, v9, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v12, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v9|, v10 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v13, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v16|, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v16|, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v14, vcc ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4 @@ -2352,6 +2379,7 @@ define hidden void @uitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -2411,6 +2439,7 @@ define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: 
global_load_dword v2, v[2:3], off ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -2556,6 +2585,7 @@ define hidden void @xor_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v9, v[2:3], off ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffff00 @@ -2631,6 +2661,7 @@ define hidden void @zext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v9, v[2:3], off ; GFX10-NEXT: v_mov_b32_e32 v0, 0xff @@ -2724,6 +2755,7 @@ define hidden void @extract3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt ; GFX10-LABEL: extract3744: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2768,6 +2800,7 @@ define hidden void @extract_perm_3744(ptr addrspace(1) %in0, ptr addrspace(1) %i ; GFX10-LABEL: extract_perm_3744: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2801,6 +2834,7 @@ define hidden void @extract1347_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %i ; GFX10-LABEL: extract1347_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off ; 
GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2858,6 +2892,7 @@ define hidden void @fshri16_8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr ; GFX10-LABEL: fshri16_8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2898,6 +2933,7 @@ define hidden void @fshri16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr ; GFX10-LABEL: fshri16_16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2938,6 +2974,7 @@ define hidden void @fshri16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr ; GFX10-LABEL: fshri16_24: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2978,6 +3015,7 @@ define hidden void @fshri16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr ; GFX10-LABEL: fshri16_32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3018,6 +3056,7 @@ define hidden void @fshri16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr ; GFX10-LABEL: fshri16_88: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3060,6 +3099,7 @@ define hidden void @fshli16_1347(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p ; GFX10-LABEL: 
fshli16_1347: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3100,6 +3140,7 @@ define hidden void @fshli16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr ; GFX10-LABEL: fshli16_16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3140,6 +3181,7 @@ define hidden void @fshli16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr ; GFX10-LABEL: fshli16_24: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3180,6 +3222,7 @@ define hidden void @fshli16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr ; GFX10-LABEL: fshli16_32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3220,6 +3263,7 @@ define hidden void @fshli16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr ; GFX10-LABEL: fshli16_88: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3260,6 +3304,7 @@ define hidden void @shlbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr ad ; GFX10-LABEL: shlbase: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v7, v[0:1], off ; GFX10-NEXT: global_load_dword v8, v[2:3], 
off ; GFX10-NEXT: v_add_nc_u32_e32 v0, 16, v6 @@ -3321,6 +3366,7 @@ define hidden void @extractbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt ; GFX10-LABEL: extractbase: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v7, v[0:1], off ; GFX10-NEXT: global_load_dword v8, v[2:3], off ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v6 @@ -3379,6 +3425,7 @@ define hidden void @extract_hilo(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p ; GFX10-LABEL: extract_hilo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[2:3], off ; GFX10-NEXT: global_load_dword v7, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3421,6 +3468,7 @@ define hidden void @extract_lohi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p ; GFX10-LABEL: extract_lohi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[2:3], off offset:4 ; GFX10-NEXT: global_load_dword v7, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3463,6 +3511,7 @@ define hidden void @extract_hihi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p ; GFX10-LABEL: extract_hihi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[2:3], off offset:4 ; GFX10-NEXT: global_load_dword v7, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3584,6 +3633,7 @@ define hidden void @extract_3src(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p ; GFX10-LABEL: extract_3src: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX10-NEXT: global_load_dword v8, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll 
b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index e452af7d60c0c..c206e1536aa68 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -102,14 +102,14 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff8000, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s35 @@ -172,6 +172,7 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -238,6 +239,7 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 @@ -480,14 +482,14 @@ define hidden amdgpu_kernel void 
@clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX900-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX900-NEXT: v_mov_b32_e32 v31, v0 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX900-NEXT: v_mov_b32_e32 v31, v0 ; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_mov_b32 s32, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX900-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 17, v0 ; GFX900-NEXT: v_and_b32_e32 v6, 0xfe000000, v0 @@ -597,6 +599,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -709,14 +712,14 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v31, v0 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[6:7] ; 
GFX90A-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 17, v0 ; GFX90A-NEXT: v_and_b32_e32 v0, 0xfe000000, v0 @@ -817,6 +820,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 @@ -1126,14 +1130,14 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff8000, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s35 @@ -1183,6 +1187,7 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -1243,6 +1248,7 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX11-NEXT: 
s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 @@ -1409,14 +1415,14 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff8000, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s35 @@ -1458,6 +1464,7 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -1501,6 +1508,7 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 @@ -1631,14 +1639,14 @@ define amdgpu_kernel 
void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff8000, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s35 @@ -1676,6 +1684,7 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -1714,6 +1723,7 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 @@ -1812,13 +1822,13 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x2000, v0 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x2800, v12 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; GFX8-NEXT: 
flat_load_dwordx2 v[4:5], v[4:5] ; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9] ; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x3000, v12 ; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v13, vcc -; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9] ; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0x3800, v12 ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc @@ -1852,51 +1862,51 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff8000, v0 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff8000, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s37 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s36, v16 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s36, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v0, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s39 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, s38, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v15, vcc, s38, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v0, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x2000, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; 
GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2048 -; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x3000, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v11, vcc -; GFX9-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 -; GFX9-NEXT: global_load_dwordx2 v[12:13], v[2:3], off -; GFX9-NEXT: global_load_dwordx2 v[14:15], v[2:3], off offset:2048 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0x2000, v15 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v16, vcc +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[8:9], v[0:1], off offset:2048 +; GFX9-NEXT: global_load_dwordx2 v[10:11], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[12:13], v[4:5], off offset:2048 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v15 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v16, vcc +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2048 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v7, vcc ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v12, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v13, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v13, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v15, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GFX9-NEXT: 
v_addc_co_u32_e32 v3, vcc, v5, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: global_store_dwordx2 v16, v[0:1], s[36:37] +; GFX9-NEXT: global_store_dwordx2 v14, v[0:1], s[36:37] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: DiffBase: @@ -1911,6 +1921,7 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX10-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -1939,6 +1950,7 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX10-NEXT: global_load_dwordx2 v[10:11], v[2:3], off ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x3800, v12 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v13, vcc_lo +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[12:13], v[0:1], off ; GFX10-NEXT: global_load_dwordx2 v[14:15], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(4) @@ -1964,6 +1976,7 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_load_b128 s[36:39], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 @@ -1987,8 +2000,8 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, 0x3000, v8 ; GFX11-NEXT: global_load_b64 v[6:7], v[2:3], off offset:-4096 ; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: s_clause 0x4 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048 -; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[4:5], 
v[4:5], off offset:2048 ; GFX11-NEXT: global_load_b64 v[10:11], v[8:9], off ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off @@ -2142,14 +2155,14 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff8000, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s35 @@ -2211,6 +2224,7 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -2281,6 +2295,7 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 @@ -2437,14 +2452,14 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; 
GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff8000, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s35 @@ -2477,6 +2492,7 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -2511,6 +2527,7 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll index d89e57245e8ea..af4475907823b 100644 --- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll @@ -3212,32 +3212,31 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) { define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr 
addrspace(4) %i21, ptr addrspace(1) nocapture noundef writeonly align 4 %arg, i32 noundef %arg1) #1 { ; GFX67-LABEL: compute_mad: ; GFX67: ; %bb.0: ; %bb -; GFX67-NEXT: s_load_dword s0, s[4:5], 0x6 +; GFX67-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 +; GFX67-NEXT: s_load_dword s2, s[4:5], 0x6 +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4 ; GFX67-NEXT: s_mov_b32 s3, 0xf000 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_add_i32 s0, s0, 1 -; GFX67-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX67-NEXT: v_add_i32_e32 v2, vcc, s0, v1 +; GFX67-NEXT: s_load_dwordx2 s[4:5], s[12:13], 0x0 +; GFX67-NEXT: s_load_dword s6, s[14:15], 0x1 +; GFX67-NEXT: s_add_i32 s2, s2, 1 +; GFX67-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX67-NEXT: v_add_i32_e32 v2, vcc, s2, v1 ; GFX67-NEXT: v_mul_lo_u32 v2, v2, v0 ; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4 -; GFX67-NEXT: v_mul_lo_u32 v3, v2, v1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_load_dword s2, s[14:15], 0x1 -; GFX67-NEXT: s_load_dwordx2 s[4:5], s[12:13], 0x0 +; GFX67-NEXT: s_and_b32 s2, s6, 0xffff +; GFX67-NEXT: v_mul_lo_u32 v3, v2, v1 +; GFX67-NEXT: s_mul_i32 s8, s8, s2 +; GFX67-NEXT: v_add_i32_e32 v0, vcc, s8, v0 ; GFX67-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX67-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v3 -; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_and_b32 s2, s2, 0xffff +; GFX67-NEXT: s_mov_b32 s2, 0 ; GFX67-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX67-NEXT: s_mul_i32 s8, s8, s2 -; GFX67-NEXT: v_add_i32_e32 v0, vcc, s8, v0 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX67-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX67-NEXT: v_mov_b32_e32 v2, s5 -; GFX67-NEXT: s_mov_b32 s2, 0 ; GFX67-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX67-NEXT: v_add_i32_e32 v3, vcc, v3, v1 ; GFX67-NEXT: v_mul_lo_u32 v4, v3, v1 @@ -3250,31 +3249,30 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) % ; ; 
GFX8-LABEL: compute_mad: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x18 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_add_i32 s0, s0, 1 -; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v1 -; GFX8-NEXT: v_mul_lo_u32 v2, v2, v0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 1, v1 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x18 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[2:3], 0x4 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v3 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX8-NEXT: v_mul_lo_u32 v3, v1, v2 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x4 +; GFX8-NEXT: s_add_i32 s6, s6, 1 +; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 1, v1 ; GFX8-NEXT: s_mul_i32 s8, s8, s2 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v3 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 @@ -3289,29 +3287,28 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) % ; ; GFX900-LABEL: compute_mad: ; GFX900: ; %bb.0: ; %bb -; GFX900-NEXT: s_load_dword s0, s[4:5], 0x18 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_add_i32 s0, s0, 1 -; GFX900-NEXT: v_mul_lo_u32 v1, s0, 
v0 -; GFX900-NEXT: v_add_u32_e32 v2, s0, v1 -; GFX900-NEXT: v_mul_lo_u32 v2, v2, v0 -; GFX900-NEXT: v_add_u32_e32 v1, 1, v1 ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dword s9, s[4:5], 0x18 ; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_load_dword s9, s[2:3], 0x4 ; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX900-NEXT: v_mul_lo_u32 v3, v2, v1 +; GFX900-NEXT: s_load_dword s10, s[2:3], 0x4 +; GFX900-NEXT: s_add_i32 s9, s9, 1 +; GFX900-NEXT: v_mul_lo_u32 v1, s9, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, s7 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_and_b32 s0, s9, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, s5 +; GFX900-NEXT: s_and_b32 s0, s10, 0xffff +; GFX900-NEXT: v_add_u32_e32 v2, s9, v1 +; GFX900-NEXT: v_mul_lo_u32 v2, v2, v0 +; GFX900-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX900-NEXT: s_mul_i32 s8, s8, s0 +; GFX900-NEXT: v_add_u32_e32 v0, s8, v0 +; GFX900-NEXT: v_mul_lo_u32 v3, v2, v1 ; GFX900-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX900-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX900-NEXT: v_add_u32_e32 v2, 1, v3 -; GFX900-NEXT: s_mul_i32 s8, s8, s0 -; GFX900-NEXT: v_add_u32_e32 v0, s8, v0 ; GFX900-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, s5 ; GFX900-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX900-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX900-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v3, v[1:2] @@ -3326,11 +3323,13 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) % ; ; GFX90A-LABEL: compute_mad: ; GFX90A: ; %bb.0: ; %bb -; GFX90A-NEXT: s_load_dword s9, s[4:5], 0x18 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dword s9, s[4:5], 0x18 ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x4 ; GFX90A-NEXT: s_add_i32 s9, s9, 1 ; GFX90A-NEXT: v_mul_lo_u32 
v0, s9, v4 ; GFX90A-NEXT: v_add_u32_e32 v1, s9, v0 @@ -3341,18 +3340,16 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) % ; GFX90A-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX90A-NEXT: v_add_u32_e32 v1, 1, v2 ; GFX90A-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX90A-NEXT: s_load_dword s4, s[2:3], 0x4 ; GFX90A-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX90A-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX90A-NEXT: v_mad_u64_u32 v[2:3], s[2:3], v0, v2, v[0:1] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX90A-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, v0, v[2:3] +; GFX90A-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v2, v[0:1] +; GFX90A-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v0, v[2:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s2, s4, 0xffff -; GFX90A-NEXT: s_mul_i32 s8, s8, s2 +; GFX90A-NEXT: s_and_b32 s0, s10, 0xffff +; GFX90A-NEXT: s_mul_i32 s8, s8, s0 ; GFX90A-NEXT: v_add_u32_e32 v1, s8, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, s1 -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s5 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s4, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] ; GFX90A-NEXT: v_mov_b32_e32 v1, s7 @@ -3363,25 +3360,26 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) % ; ; GFX10-LABEL: compute_mad: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x18 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s0, s0, 1 -; GFX10-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v2, s0, v1 +; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x18 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 -; GFX10-NEXT: v_mul_lo_u32 v2, v2, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, v2, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; 
GFX10-NEXT: s_load_dword s2, s[2:3], 0x4 +; GFX10-NEXT: s_add_i32 s6, s6, 1 +; GFX10-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v2, s6, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 +; GFX10-NEXT: v_mul_lo_u32 v2, v2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: v_mul_lo_u32 v3, v2, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v3, v1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v3 ; GFX10-NEXT: v_mul_lo_u32 v4, v2, v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s2, s2, 0xffff ; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, s8, s2, v[0:1] ; GFX10-NEXT: v_mul_lo_u32 v1, v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index 0a746b0a3f572..9a5bf220d200c 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -344,13 +344,13 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 48, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_ushort v2, v[2:3] ; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: flat_load_ushort v1, v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b16_e32 v1, v2, v0 -; GFX8-NEXT: v_sub_u16_e32 v2, 0, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v2, v1, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, v1, v0 +; GFX8-NEXT: v_sub_u16_e32 v1, 0, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v0, v1, v0 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_store_short v[0:1], v2 @@ -360,6 +360,7 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add ; GFX10-LABEL: test_rotl_i16: ; GFX10: ; %bb.0: ; %entry ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v6, v[2:3], off offset:48 ; GFX10-NEXT: global_load_ushort v7, v[0:1], off offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -374,6 +375,7 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add ; GFX11-TRUE16-LABEL: test_rotl_i16: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[2:3], off offset:48 ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off offset:32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -389,6 +391,7 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add ; GFX11-FAKE16-LABEL: test_rotl_i16: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_u16 v2, v[2:3], off offset:48 ; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off offset:32 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index d6e361d6e297e..4a13418405efb 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -301,13 +301,13 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 48, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_ushort v2, v[2:3] ; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: flat_load_ushort v1, v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v0 -; GFX8-NEXT: v_sub_u16_e32 v2, 0, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v2, v1, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v1, v0 +; GFX8-NEXT: v_sub_u16_e32 v1, 0, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v1, v0 +; 
GFX8-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_store_short v[0:1], v2 @@ -317,6 +317,7 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add ; GFX10-LABEL: test_rotr_i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v6, v[2:3], off offset:48 ; GFX10-NEXT: global_load_ushort v7, v[0:1], off offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -331,6 +332,7 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add ; GFX11-TRUE16-LABEL: test_rotr_i16: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[2:3], off offset:48 ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off offset:32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -346,6 +348,7 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add ; GFX11-FAKE16-LABEL: test_rotr_i16: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_u16 v2, v[2:3], off offset:48 ; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off offset:32 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll index 8ad6a4e534d23..860223d4192ae 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll @@ -7,15 +7,15 @@ define void @extracted_values(ptr %ret_struct, ptr addrspace(3) %arg0, ptr addrs ; CHECK-LABEL: extracted_values: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_b32 v3, v3 ; CHECK-NEXT: ds_read_b32 v4, v4 -; CHECK-NEXT: ds_read_b32 v2, v2 ; CHECK-NEXT: 
ds_read_b32 v5, v5 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: ds_read_b32 v3, v3 +; CHECK-NEXT: ds_read_b32 v2, v2 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) ; CHECK-NEXT: v_sub_f16_sdwa v6, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; CHECK-NEXT: v_sub_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_sub_f16_sdwa v7, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; CHECK-NEXT: v_sub_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; CHECK-NEXT: v_sub_f16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; CHECK-NEXT: v_add_f16_e32 v4, v6, v7 ; CHECK-NEXT: v_add_f16_e32 v2, v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index 21719226710de..b6d47267e23df 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -764,81 +764,81 @@ define amdgpu_kernel void @select_v2f16( ; SI-NEXT: s_mov_b32 s21, s13 ; SI-NEXT: s_mov_b32 s22, s2 ; SI-NEXT: s_mov_b32 s23, s3 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: s_mov_b32 s12, s14 ; SI-NEXT: s_mov_b32 s13, s15 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 -; SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v1, off, s[20:23], 0 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v3, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; 
SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cmp_lt_f32_e32 vcc, v5, v6 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; SI-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2 +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5 +; SI-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: select_v2f16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s22, s6 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 +; VI-NEXT: s_mov_b32 s18, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s20, s12 -; VI-NEXT: s_mov_b32 s21, s13 -; VI-NEXT: s_mov_b32 s23, s7 ; VI-NEXT: s_mov_b32 s16, s10 ; VI-NEXT: s_mov_b32 s17, s11 -; VI-NEXT: s_mov_b32 s18, s6 ; VI-NEXT: s_mov_b32 s19, s7 -; VI-NEXT: buffer_load_dword v0, off, s[20:23], 0 -; VI-NEXT: 
buffer_load_dword v1, off, s[16:19], 0 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_mov_b32 s20, s12 +; VI-NEXT: s_mov_b32 s21, s13 +; VI-NEXT: s_mov_b32 s22, s6 +; VI-NEXT: s_mov_b32 s23, s7 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; VI-NEXT: s_mov_b32 s12, s14 ; VI-NEXT: s_mov_b32 s13, s15 ; VI-NEXT: s_mov_b32 s14, s6 ; VI-NEXT: s_mov_b32 s15, s7 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], 0 -; VI-NEXT: buffer_load_dword v3, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], 0 ; VI-NEXT: s_mov_b32 s4, s8 ; VI-NEXT: s_mov_b32 s5, s9 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v5, v4 -; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], v1, v0 +; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], v0, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e64 v0, v2, v3, s[0:1] -; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[0:1] +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -850,39 +850,41 @@ define amdgpu_kernel void @select_v2f16( ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x44 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2 -; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3 ; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3 -; 
GFX11-TRUE16-NEXT: s_mov_b32 s6, s2 -; GFX11-TRUE16-NEXT: s_mov_b32 s7, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3 ; GFX11-TRUE16-NEXT: s_mov_b32 s26, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s27, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s20, s12 -; GFX11-TRUE16-NEXT: s_mov_b32 s21, s13 ; GFX11-TRUE16-NEXT: s_mov_b32 s16, s10 ; GFX11-TRUE16-NEXT: s_mov_b32 s17, s11 -; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[20:23], 0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s20, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s21, s13 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[20:23], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s24, s14 ; GFX11-TRUE16-NEXT: s_mov_b32 s25, s15 -; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v3, off, s[24:27], 0 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[24:27], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v3, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, s9 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0.l, v1.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, v5.l, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.l, 
v2.l, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, s8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -897,40 +899,42 @@ define amdgpu_kernel void @select_v2f16( ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x44 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-FAKE16-NEXT: s_mov_b32 s6, s2 -; GFX11-FAKE16-NEXT: s_mov_b32 s7, s3 -; GFX11-FAKE16-NEXT: s_mov_b32 s22, s2 -; GFX11-FAKE16-NEXT: s_mov_b32 s23, s3 ; GFX11-FAKE16-NEXT: s_mov_b32 s18, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s19, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s22, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s23, s3 ; GFX11-FAKE16-NEXT: s_mov_b32 s26, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s27, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s3 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_mov_b32 s20, s12 -; GFX11-FAKE16-NEXT: s_mov_b32 s21, s13 ; GFX11-FAKE16-NEXT: s_mov_b32 s16, s10 ; GFX11-FAKE16-NEXT: s_mov_b32 s17, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s20, s12 +; GFX11-FAKE16-NEXT: s_mov_b32 s21, s13 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[20:23], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s24, s14 ; GFX11-FAKE16-NEXT: s_mov_b32 s25, s15 -; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[20:23], 0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[16:19], 0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v3, off, s[24:27], 0 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[24:27], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v3, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, s8 ; GFX11-FAKE16-NEXT: 
s_mov_b32 s1, s9 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v2, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v6, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v5, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v6 :: v_dual_and_b32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-FAKE16-NEXT: s_endpgm @@ -1052,6 +1056,7 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9 ; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10 ; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11 +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5 @@ -1091,6 +1096,7 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6 ; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7 +; GFX11-FAKE16-NEXT: s_clause 0x2 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0 @@ -1228,6 
+1234,7 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9 ; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10 ; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11 +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5 @@ -1267,6 +1274,7 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5 ; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6 ; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7 +; GFX11-FAKE16-NEXT: s_clause 0x2 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0 @@ -1355,34 +1363,34 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s18, s10 -; VI-NEXT: s_mov_b32 s19, s11 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s16, s4 -; VI-NEXT: s_mov_b32 s17, s5 ; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 -; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 -; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3800 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; VI-NEXT: s_waitcnt 
vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v0 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_mov_b32_e32 v1, 0x3900 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v5, v4 -; VI-NEXT: v_mov_b32_e32 v1, 0x3900 ; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 @@ -1393,28 +1401,29 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; GFX11-TRUE16-NEXT: s_load_b256 s[4:11], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2 -; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3 ; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8 -; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7 -; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10 ; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5 ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0.l, v1.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1446,6 +1455,7 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 ; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6 ; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7 +; GFX11-FAKE16-NEXT: s_clause 0x2 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0 @@ -1534,34 +1544,34 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s18, s10 -; VI-NEXT: s_mov_b32 s19, s11 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s16, s4 -; VI-NEXT: s_mov_b32 s17, s5 ; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 -; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 -; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3800 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) -; 
VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; VI-NEXT: v_cmp_lt_f16_e32 vcc, v1, v0 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_mov_b32_e32 v1, 0x3900 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v5, v4 -; VI-NEXT: v_mov_b32_e32 v1, 0x3900 ; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 @@ -1572,28 +1582,29 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; GFX11-TRUE16-NEXT: s_load_b256 s[4:11], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2 -; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3 ; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8 -; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7 -; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10 ; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5 ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0 ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0.l, v1.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1625,6 +1636,7 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 ; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6 ; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7 +; GFX11-FAKE16-NEXT: s_clause 0x2 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0 @@ -3496,22 +3508,22 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; GFX11-FAKE16-NEXT: scratch_load_b32 v52, off, s32 offset:24 ; GFX11-FAKE16-NEXT: scratch_load_b32 v53, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v54, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v55, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_b32 v64, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_b32 v66, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_b32 v67, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_b32 v68, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v69, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v70, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v71, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v80, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v81, off, s32 offset:44 -; GFX11-FAKE16-NEXT: 
scratch_load_b32 v82, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v83, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v55, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_b32 v64, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_b32 v66, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_b32 v67, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v68, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v69, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v70, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_b32 v71, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v80, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v81, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v82, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v83, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v84, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v85, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v86, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v85, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v86, off, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:128 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v30 @@ -3590,69 +3602,57 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v54 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v98, vcc_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v55 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v66 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v33, v15, vcc_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 
vcc_lo, 0, v64 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v65 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v64 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v14, v97, v14, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v66 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v55 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v67 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v70 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v12, v34, v12, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v68 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v69 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v68 ; GFX11-FAKE16-NEXT: v_perm_b32 v13, v99, v13, 0x5040100 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v10, v36, v10, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v70 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v67 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v71 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v82 ; GFX11-FAKE16-NEXT: v_perm_b32 v11, v35, v11, 0x5040100 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v8, v38, v8, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v80 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v81 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v80 ; GFX11-FAKE16-NEXT: v_perm_b32 v9, v37, v9, 0x5040100 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v6, v48, v6, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v82 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v71 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v83 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v86 ; GFX11-FAKE16-NEXT: v_perm_b32 v7, v39, v7, 0x5040100 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v4, v50, v4, 0x5040100 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v84 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v85 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v83 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v86 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v85 ; GFX11-FAKE16-NEXT: 
v_perm_b32 v5, v49, v5, 0x5040100 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v32, v0, 0x5040100 diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index 0b68a0534fa08..fbf4ccfe82d6a 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -382,20 +382,20 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s10 ; SI-NEXT: s_mov_b32 s13, s11 -; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 +; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 -; SI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc +; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 1.0, v1 +; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1] ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s0, s8 @@ -409,20 +409,20 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s10 ; VI-NEXT: s_mov_b32 s13, s11 -; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_mov_b32 s15, s3 -; 
VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 +; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 -; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc +; VI-NEXT: v_cmp_le_f32_e64 s[0:1], 1.0, v1 +; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1] ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: s_mov_b32 s0, s8 @@ -437,22 +437,23 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6 -; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s6 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s10 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s11 -; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, s8 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s9 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 0, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 1.0, v1 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, vcc_lo, 
s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 @@ -466,22 +467,23 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6 -; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s6 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s7 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s10 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s11 -; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[0:3], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, s8 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, s9 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 0, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 1.0, v1 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll index b7e6ebaa655b9..43322e6f33685 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ 
b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -21,8 +21,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr5 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr18 - ; CHECK-NEXT: undef [[COPY7:%[0-9]+]].sub0:sgpr_64 = COPY $sgpr19 + ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub0:sgpr_64 = COPY $sgpr19 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr18 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr20 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr21 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr22 @@ -36,47 +36,46 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY6]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4) ; CHECK-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_]], 31, implicit-def dead $scc - ; CHECK-NEXT: [[S_ASHR_I32_1:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_1]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub1:sgpr_128 = S_AND_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 65535, implicit-def dead $scc + ; CHECK-NEXT: [[S_ASHR_I32_1:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_1]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ASHR_I32_2:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_2]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 29, implicit-def dead $scc ; CHECK-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 
= S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: KILL undef %74:sreg_64 ; CHECK-NEXT: KILL [[S_ADD_U32_]].sub0, [[S_ADD_U32_]].sub1 - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM1]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 
0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: KILL undef %89:sgpr_128 - ; CHECK-NEXT: KILL undef %118:sgpr_128 ; CHECK-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_2:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.87, addrspace 4) - ; CHECK-NEXT: 
[[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.93, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.87, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.93, addrspace 4) ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1 ; CHECK-NEXT: KILL [[S_ADD_U32_2]].sub0, [[S_ADD_U32_2]].sub1 ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc + ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]], undef %169:sreg_32, implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]].sub0, [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]].sub0, [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = 
S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %169:sreg_32, implicit-def $scc + ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]].sub0, undef %169:sreg_32, implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]].sub0, [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc @@ -90,21 +89,21 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %302:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: 
[[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4) + ; CHECK-NEXT: 
[[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %363:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from 
%ir.112, addrspace 4) ; CHECK-NEXT: undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc @@ -114,58 +113,59 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %384:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4) - ; CHECK-NEXT: 
[[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.134, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.162, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant 
load (s128) from %ir.129, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.134, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.162, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -217, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -233, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_10:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM3]], -297, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -313, implicit-def dead $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4) - ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc + ; CHECK-NEXT: 
[[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -313, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -329, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -345, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4) ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), 
align 1, addrspace 8) ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060 + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1 ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4) ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1 + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc @@ -198,9 +198,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: 
[[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1 - ; CHECK-NEXT: KILL undef %470:sreg_64 ; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3 ; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4) @@ -219,10 +217,10 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY18]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -475, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -491, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 
[[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -507, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -539, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc @@ -240,13 +238,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]] ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] - ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_U32_e64_]], [[V_ADD_U32_e64_1]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -4, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_U32_e64_1]], [[V_ADD_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -4, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, 
implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_]], [[V_ADD_U32_e64_2]], implicit $exec - ; CHECK-NEXT: [[V_SUBREV_U32_e64_:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 27, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUBREV_U32_e64_:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 27, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_1]], [[V_ADD_U32_e64_3]], implicit $exec ; CHECK-NEXT: [[V_SUBREV_U32_e64_1:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 28, [[BUFFER_LOAD_DWORD_OFFSET]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_3:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_2]], [[V_SUBREV_U32_e64_]], implicit $exec @@ -265,7 +263,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_OR_B32_e64_11:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_10]], [[V_SUBREV_U32_e64_5]], implicit $exec ; CHECK-NEXT: [[V_SUBREV_U32_e64_7:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 38, [[BUFFER_LOAD_FORMAT_X_IDXEN7]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_12:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_11]], [[V_SUBREV_U32_e64_6]], implicit $exec - ; CHECK-NEXT: [[V_SUBREV_U32_e64_8:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 39, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUBREV_U32_e64_8:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 39, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_13:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_12]], [[V_SUBREV_U32_e64_7]], implicit $exec ; CHECK-NEXT: [[V_SUBREV_U32_e64_9:%[0-9]+]]:vgpr_32 = V_SUBREV_U32_e64 50, [[BUFFER_LOAD_FORMAT_X_IDXEN8]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_14:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_13]], [[V_SUBREV_U32_e64_8]], implicit $exec @@ -279,13 +277,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: 
[[V_OR_B32_e64_18:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_17]], [[V_SUBREV_U32_e64_12]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -73, [[BUFFER_LOAD_FORMAT_X_IDXEN13]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_19:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_18]], [[V_ADD_U32_e64_4]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -74, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -74, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_20:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_19]], [[V_ADD_U32_e64_5]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -75, [[BUFFER_LOAD_FORMAT_X_IDXEN14]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_21:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_20]], [[V_ADD_U32_e64_6]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -77, [[BUFFER_LOAD_FORMAT_X_IDXEN15]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_22:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_21]], [[V_ADD_U32_e64_7]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -93, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -93, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_23:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_22]], [[V_ADD_U32_e64_8]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -94, [[BUFFER_LOAD_FORMAT_X_IDXEN16]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_24:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_23]], [[V_ADD_U32_e64_9]], implicit $exec @@ -307,7 +305,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_OR_B32_e64_34:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 
[[V_OR_B32_e64_33]], [[V_ADD_U32_e64_15]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -197, [[BUFFER_LOAD_FORMAT_X_IDXEN20]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_35:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_34]], [[V_ADD_U32_e64_16]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -216, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -216, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_36:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_35]], [[V_ADD_U32_e64_17]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_37:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_36]], [[V_ADD_U32_e64_18]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_38:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_7]], [[V_OR_B32_e64_37]], implicit $exec @@ -317,9 +315,9 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_OR_B32_e64_42:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_11]], [[V_OR_B32_e64_41]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_43:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_12]], [[V_OR_B32_e64_42]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_44:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_13]], [[V_OR_B32_e64_43]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -457, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -457, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_45:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_14]], [[V_OR_B32_e64_44]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -458, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -458, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec 
; CHECK-NEXT: [[V_OR_B32_e64_46:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_45]], [[V_ADD_U32_e64_19]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -459, [[BUFFER_LOAD_FORMAT_X_IDXEN21]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_47:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_46]], [[V_ADD_U32_e64_20]], implicit $exec @@ -340,16 +338,16 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_OR_B32_e64_59:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_58]], [[V_ADD_U32_e64_23]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -557, [[BUFFER_LOAD_FORMAT_X_IDXEN25]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_60:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_59]], [[V_ADD_U32_e64_24]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -574, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -574, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_61:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_60]], [[V_ADD_U32_e64_25]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -575, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -575, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_62:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_61]], [[V_ADD_U32_e64_26]], implicit $exec ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM8:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX2_IMM]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -576, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -576, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, 
implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_63:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_62]], [[V_ADD_U32_e64_27]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -577, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -577, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %543:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index a3aeea8a145cd..474a69845e635 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -710,19 +710,19 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[12:13], s[2:3] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[14:15] -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[12:13], s[2:3] -; GFX6-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[12:15], 0 addr64 +; GFX6-NEXT: 
buffer_load_dwordx2 v[2:3], v[0:1], s[12:15], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: s_mov_b32 s8, s0 ; GFX6-NEXT: s_mov_b32 s9, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm ; @@ -800,21 +800,21 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[12:13], s[2:3] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, 0 ; GFX6-NEXT: s_mov_b64 s[6:7], s[14:15] -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[12:13], s[2:3] -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 -; GFX6-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[12:15], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[12:15], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: s_mov_b32 s8, s0 ; GFX6-NEXT: s_mov_b32 s9, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v6, v2 -; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm ; @@ -935,14 +935,14 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s2, v0 ; GFX8-NEXT: 
v_addc_u32_e32 v9, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v0 -; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v0 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[8:9] -; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[12:13] +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[10:11] ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 16, v8 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GFX8-NEXT: v_add_u32_e32 v12, vcc, 16, v12 -; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 16, v10 +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v11, vcc ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-NEXT: v_mov_b32_e32 v17, s1 diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index 42bd2ff8797a1..1997e2bb10678 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -97,19 +97,17 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 { ; GFX9-LABEL: s_test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 -; GFX9-NEXT: s_load_dword s11, s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_pk_sub_i16 v0, s11, v0 -; GFX9-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_pk_sub_i16 v0, s4, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_test_sub_v2i16: @@ -139,6 +137,7 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX10-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -154,6 +153,7 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 diff --git a/llvm/test/CodeGen/AMDGPU/test-enable-diffbase-clustering-flag.ll b/llvm/test/CodeGen/AMDGPU/test-enable-diffbase-clustering-flag.ll new file mode 100644 index 0000000000000..9a82b5727ba37 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/test-enable-diffbase-clustering-flag.ll @@ -0,0 +1,117 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 -amdgpu-enable-diff-baseptr-mem-clustering=false < %s | FileCheck -check-prefixes=GFX10N %s +define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %i21, ptr addrspace(1) nocapture noundef writeonly align 4 %arg, i32 noundef %arg1) #1 { +; GFX10-LABEL: compute_mad: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x18 +; GFX10-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s2, s[2:3], 0x4 +; GFX10-NEXT: s_add_i32 s6, s6, 1 +; GFX10-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v2, s6, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 +; GFX10-NEXT: v_mul_lo_u32 v2, v2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: v_mul_lo_u32 v3, v2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v3, v1 +; GFX10-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, v2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v1 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, s8, s2, v[0:1] +; GFX10-NEXT: v_mul_lo_u32 v1, v3, v2 +; GFX10-NEXT: v_add_co_u32 v2, s0, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, null, s1, 0, s0 +; GFX10-NEXT: v_mad_u64_u32 v[4:5], null, v1, v4, v[1:2] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v4, v1, v[4:5] +; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, s4, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, null, s5, v3, vcc_lo +; GFX10-NEXT: global_store_dword v[1:2], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX10N-LABEL: compute_mad: +; GFX10N: ; %bb.0: ; %bb +; GFX10N-NEXT: s_load_dword s0, s[4:5], 0x18 +; GFX10N-NEXT: s_waitcnt lgkmcnt(0) +; GFX10N-NEXT: s_add_i32 s0, s0, 1 +; GFX10N-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX10N-NEXT: v_add_nc_u32_e32 v2, s0, v1 +; GFX10N-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10N-NEXT: v_add_nc_u32_e32 v1, 1, v1 +; GFX10N-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX10N-NEXT: v_mul_lo_u32 v2, v2, v0 +; GFX10N-NEXT: v_mul_lo_u32 v3, v2, v1 +; GFX10N-NEXT: s_waitcnt lgkmcnt(0) +; GFX10N-NEXT: s_load_dword s2, s[2:3], 0x4 +; GFX10N-NEXT: v_add_nc_u32_e32 v1, v3, v1 +; GFX10N-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10N-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX10N-NEXT: v_add_nc_u32_e32 v1, 1, v3 
+; GFX10N-NEXT: v_mul_lo_u32 v4, v2, v1 +; GFX10N-NEXT: s_waitcnt lgkmcnt(0) +; GFX10N-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10N-NEXT: v_add_nc_u32_e32 v3, v4, v1 +; GFX10N-NEXT: v_mad_u64_u32 v[0:1], null, s8, s2, v[0:1] +; GFX10N-NEXT: v_mul_lo_u32 v1, v3, v2 +; GFX10N-NEXT: v_add_co_u32 v2, s0, s0, v0 +; GFX10N-NEXT: v_add_co_ci_u32_e64 v3, null, s1, 0, s0 +; GFX10N-NEXT: v_mad_u64_u32 v[4:5], null, v1, v4, v[1:2] +; GFX10N-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX10N-NEXT: v_mad_u64_u32 v[0:1], null, v4, v1, v[4:5] +; GFX10N-NEXT: v_add_co_u32 v1, vcc_lo, s4, v2 +; GFX10N-NEXT: v_add_co_ci_u32_e64 v2, null, s5, v3, vcc_lo +; GFX10N-NEXT: global_store_dword v[1:2], v0, off +; GFX10N-NEXT: s_endpgm +bb: + %i = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0 + %i2 = add i32 %arg1, 1 + %i3 = mul i32 %i2, %i + %i4 = add i32 %i3, %i2 + %i5 = mul i32 %i4, %i + %i6 = add i32 %i3, 1 + %i7 = mul i32 %i5, %i6 + %i8 = add i32 %i7, %i6 + %i9 = mul i32 %i8, %i5 + %i10 = add i32 %i7, 1 + %i11 = mul i32 %i9, %i10 + %i12 = add i32 %i11, %i10 + %i13 = mul i32 %i12, %i9 + %i14 = add i32 %i11, 1 + %i15 = add i32 %i13, 1 + %i16 = mul i32 %i13, %i14 + %i17 = mul i32 %i16, %i15 + %i19 = load i64, ptr addrspace(4) %i18, align 8 + %i20 = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i22 = getelementptr i8, ptr addrspace(4) %i21, i64 4 + %i23 = load i16, ptr addrspace(4) %i22, align 4 + %i24 = zext i16 %i23 to i32 + %i25 = mul i32 %i20, %i24 + %i26 = add i32 %i25, %i + %i27 = zext i32 %i26 to i64 + %i28 = add i64 %i19, %i27 + %i29 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %i28 + store i32 %i17, ptr addrspace(1) %i29, align 4 + ret void +} + +declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #2 +declare i32 @llvm.amdgcn.workitem.id.x() #2 +declare align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #2 +declare i32 @llvm.amdgcn.workgroup.id.x() #2 +declare i64 @llvm.amdgcn.mul.u24(i32, i32) +declare i64 @llvm.amdgcn.mul.i24(i32, i32) + +attributes #0 
= { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } +attributes #1 = { mustprogress nofree nosync nounwind willreturn memory(read, argmem: readwrite, inaccessiblemem: none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!0 = !{i32 0, i32 1024} diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index eb1b844ad8938..4f622ecb7925d 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -382,20 +382,20 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s10 ; SI-NEXT: s_mov_b32 s13, s11 -; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 +; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 -; SI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc +; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 1.0, v1 +; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s0, s8 @@ -409,20 +409,20 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s10 ; VI-NEXT: s_mov_b32 s13, s11 -; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 +; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 -; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc +; VI-NEXT: v_cmp_le_f32_e64 s[0:1], 1.0, v1 +; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: s_mov_b32 s0, s8 @@ -437,22 +437,23 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6 -; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s6 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s10 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s11 -; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, s8 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s9 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 0, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 
-; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 1.0, v1 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 @@ -466,22 +467,23 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6 -; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s6 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s7 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s10 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s11 -; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[0:3], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, s8 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, s9 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 0, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 1.0, v1 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll index 580938f922a04..fd5eb5d80495a 100644 --- 
a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -67,6 +67,7 @@ define amdgpu_kernel void @madak_f16( ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 ; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 @@ -92,6 +93,7 @@ define amdgpu_kernel void @madak_f16( ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 ; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 @@ -223,6 +225,7 @@ define amdgpu_kernel void @madak_f16_use_2( ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 0x4900, v0.h ; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 0x4900, v0.l +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_endpgm @@ -260,6 +263,7 @@ define amdgpu_kernel void @madak_f16_use_2( ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 0x4900, v1 ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 0x4900, v0 +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-FAKE16-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll index bce7c1e5e8ab7..51fc72be41f36 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll @@ -2080,8 +2080,8 @@ define double 
@test_vector_reduce_fadd_v16double(double %sp, <16 x double> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fadd_v16double: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32 +; GFX9-SDAG-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GFX9-SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] @@ -2097,7 +2097,7 @@ define double @test_vector_reduce_fadd_v16double(double %sp, <16 x double> %v) { ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[24:25] ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[26:27] ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[28:29] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(2) ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[30:31] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[32:33] diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll index 657fe0f0804f3..3b8c3de3e5433 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll @@ -2080,8 +2080,8 @@ define double @test_vector_reduce_fmul_v16double(double %sp, <16 x double> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_fmul_v16double: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32 +; GFX9-SDAG-NEXT: scratch_load_dword v33, off, s32 offset:8 ; GFX9-SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4 ; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] @@ -2097,7 +2097,7 @@ define double @test_vector_reduce_fmul_v16double(double %sp, <16 x double> %v) { ; GFX9-SDAG-NEXT: v_mul_f64 
v[0:1], v[0:1], v[24:25] ; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[26:27] ; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[28:29] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(2) ; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[30:31] ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[32:33] diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 58602a1ccd5ba..10bf1358b2aa8 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -58,6 +58,7 @@ define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_234u: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -69,6 +70,7 @@ define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-LABEL: shuffle_v4f16_234u: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -198,6 +200,7 @@ define <4 x half> @shuffle_v4f16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_3u6u: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -209,6 +212,7 @@ define <4 x half> @shuffle_v4f16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-LABEL: shuffle_v4f16_3u6u: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -219,6 +223,7 @@ define <4 x half> @shuffle_v4f16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-FAKE16-LABEL: shuffle_v4f16_3u6u: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -257,6 +262,7 @@ define <4 x half> @shuffle_v4f16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_3uu7: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -268,6 +274,7 @@ define <4 x half> @shuffle_v4f16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-LABEL: shuffle_v4f16_3uu7: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -278,6 +285,7 @@ define <4 x half> @shuffle_v4f16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-FAKE16-LABEL: shuffle_v4f16_3uu7: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -316,6 +324,7 @@ define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_35u5: 
; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -326,6 +335,7 @@ define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-LABEL: shuffle_v4f16_35u5: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -338,6 +348,7 @@ define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-FAKE16-LABEL: shuffle_v4f16_35u5: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -377,6 +388,7 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_357u: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -388,6 +400,7 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-LABEL: shuffle_v4f16_357u: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[2:3], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -401,6 +414,7 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) % ; 
GFX11-FAKE16-LABEL: shuffle_v4f16_357u: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -486,6 +500,7 @@ define <4 x half> @shuffle_v4f16_0145(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_0145: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -497,6 +512,7 @@ define <4 x half> @shuffle_v4f16_0145(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-LABEL: shuffle_v4f16_0145: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -522,6 +538,7 @@ define <4 x half> @shuffle_v4f16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_0167: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -533,6 +550,7 @@ define <4 x half> @shuffle_v4f16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-LABEL: shuffle_v4f16_0167: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -627,6 +645,7 @@ define <4 x half> @shuffle_v4f16_2345(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_2345: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -638,6 +657,7 @@ define <4 x half> @shuffle_v4f16_2345(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-LABEL: shuffle_v4f16_2345: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -663,6 +683,7 @@ define <4 x half> @shuffle_v4f16_2367(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_2367: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -674,6 +695,7 @@ define <4 x half> @shuffle_v4f16_2367(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-LABEL: shuffle_v4f16_2367: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -699,6 +721,7 @@ define <4 x half> @shuffle_v4f16_4501(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_4501: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -710,6 +733,7 @@ define <4 x half> @shuffle_v4f16_4501(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-LABEL: shuffle_v4f16_4501: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off ; GFX11-NEXT: global_load_b32 v1, v[0:1], off ; 
GFX11-NEXT: s_waitcnt vmcnt(1) @@ -737,6 +761,7 @@ define <4 x half> @shuffle_v4f16_4523(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_4523: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -748,6 +773,7 @@ define <4 x half> @shuffle_v4f16_4523(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-LABEL: shuffle_v4f16_4523: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) @@ -832,6 +858,7 @@ define <4 x half> @shuffle_v4f16_6701(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_6701: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -843,6 +870,7 @@ define <4 x half> @shuffle_v4f16_6701(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-LABEL: shuffle_v4f16_6701: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(1) @@ -870,6 +898,7 @@ define <4 x half> @shuffle_v4f16_6723(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_6723: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -881,6 +910,7 @@ define <4 x half> @shuffle_v4f16_6723(ptr addrspace(1) %arg0, ptr 
addrspace(1) % ; GFX11-LABEL: shuffle_v4f16_6723: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) @@ -988,6 +1018,7 @@ define <4 x half> @shuffle_v4f16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_2356: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -999,6 +1030,7 @@ define <4 x half> @shuffle_v4f16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-LABEL: shuffle_v4f16_2356: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -1010,6 +1042,7 @@ define <4 x half> @shuffle_v4f16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-FAKE16-LABEL: shuffle_v4f16_2356: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -1048,6 +1081,7 @@ define <4 x half> @shuffle_v4f16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_5623: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -1059,6 +1093,7 @@ define <4 x half> @shuffle_v4f16_5623(ptr addrspace(1) %arg0, ptr 
addrspace(1) % ; GFX11-TRUE16-LABEL: shuffle_v4f16_5623: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -1070,6 +1105,7 @@ define <4 x half> @shuffle_v4f16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-FAKE16-LABEL: shuffle_v4f16_5623: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -1097,6 +1133,7 @@ define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_3456: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -1108,6 +1145,7 @@ define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-LABEL: shuffle_v4f16_3456: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v[2:3], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -1121,6 +1159,7 @@ define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-FAKE16-LABEL: shuffle_v4f16_3456: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ 
-1149,6 +1188,7 @@ define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_5634: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -1160,6 +1200,7 @@ define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-LABEL: shuffle_v4f16_5634: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -1173,6 +1214,7 @@ define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-FAKE16-LABEL: shuffle_v4f16_5634: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -1214,6 +1256,7 @@ define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_5734: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -1225,6 +1268,7 @@ define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-LABEL: shuffle_v4f16_5734: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -1237,6 +1281,7 @@ define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-FAKE16-LABEL: shuffle_v4f16_5734: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -1276,6 +1321,7 @@ define <4 x i16> @shuffle_v4i16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %a ; GFX10-LABEL: shuffle_v4i16_2356: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -1287,6 +1333,7 @@ define <4 x i16> @shuffle_v4i16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %a ; GFX11-TRUE16-LABEL: shuffle_v4i16_2356: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -1299,6 +1346,7 @@ define <4 x i16> @shuffle_v4i16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %a ; GFX11-FAKE16-LABEL: shuffle_v4i16_2356: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -1326,6 +1374,7 @@ define <4 x i16> @shuffle_v4i16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %a ; GFX10-LABEL: shuffle_v4i16_0167: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: 
global_load_dword v5, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -1337,6 +1386,7 @@ define <4 x i16> @shuffle_v4i16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %a ; GFX11-LABEL: shuffle_v4i16_0167: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1528,6 +1578,7 @@ define <4 x half> @shuffle_v4f16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_6161: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1538,6 +1589,7 @@ define <4 x half> @shuffle_v4f16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-LABEL: shuffle_v4f16_6161: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -1551,6 +1603,7 @@ define <4 x half> @shuffle_v4f16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-FAKE16-LABEL: shuffle_v4f16_6161: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -1736,6 +1789,7 @@ define <4 x half> @shuffle_v8f16_4589(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v8f16_4589: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:8 ; GFX10-NEXT: global_load_dword v5, 
v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -1747,6 +1801,7 @@ define <4 x half> @shuffle_v8f16_4589(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-LABEL: shuffle_v8f16_4589: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:8 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1772,6 +1827,7 @@ define <4 x half> @shuffle_v8f16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrspace ; GFX10-LABEL: shuffle_v8f16_10_11_2_3: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -1783,6 +1839,7 @@ define <4 x half> @shuffle_v8f16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrspace ; GFX11-LABEL: shuffle_v8f16_10_11_2_3: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) @@ -1821,6 +1878,7 @@ define <4 x half> @shuffle_v8f16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrspace ; GFX10-LABEL: shuffle_v8f16_13_14_2_3: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -1832,6 +1890,7 @@ define <4 x half> @shuffle_v8f16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrspace ; GFX11-TRUE16-LABEL: shuffle_v8f16_13_14_2_3: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off offset:8 ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -1843,6 +1902,7 @@ define <4 x half> @shuffle_v8f16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrspace ; GFX11-FAKE16-LABEL: shuffle_v8f16_13_14_2_3: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off offset:8 ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -1980,6 +2040,7 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx3 v[0:2], v[5:6], off ; GFX10-NEXT: global_load_dword v7, v[3:4], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -1992,6 +2053,7 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off ; GFX11-NEXT: global_load_b32 v3, v[3:4], off ; GFX11-NEXT: s_waitcnt vmcnt(1) @@ -2138,6 +2200,7 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_0456: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 @@ -2150,6 +2213,7 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-LABEL: shuffle_v4f16_0456: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-TRUE16-NEXT: global_load_b64 
v[0:1], v[0:1], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -2161,6 +2225,7 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-FAKE16-LABEL: shuffle_v4f16_0456: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-FAKE16-NEXT: global_load_b64 v[1:2], v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -2255,6 +2320,7 @@ define <2 x half> @low16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX10-LABEL: low16bits_v2f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2264,6 +2330,7 @@ define <2 x half> @low16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX11-TRUE16-LABEL: low16bits_v2f16: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -2273,6 +2340,7 @@ define <2 x half> @low16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX11-FAKE16-LABEL: low16bits_v2f16: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -2310,6 +2378,7 @@ define <2 x half> @hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX10-LABEL: hi16bits_v2f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: 
global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2319,6 +2388,7 @@ define <2 x half> @hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX11-TRUE16-LABEL: hi16bits_v2f16: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -2331,6 +2401,7 @@ define <2 x half> @hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX11-FAKE16-LABEL: hi16bits_v2f16: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -2368,6 +2439,7 @@ define <2 x half> @low16hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x ; GFX10-LABEL: low16hi16bits_v2f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2377,6 +2449,7 @@ define <2 x half> @low16hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x ; GFX11-LABEL: low16hi16bits_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2403,6 +2476,7 @@ define <2 x half> @hi16low16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) % ; GFX10-LABEL: hi16low16bits_v2bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; 
GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2412,6 +2486,7 @@ define <2 x half> @hi16low16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) % ; GFX11-TRUE16-LABEL: hi16low16bits_v2bf16: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -2423,6 +2498,7 @@ define <2 x half> @hi16low16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) % ; GFX11-FAKE16-LABEL: hi16low16bits_v2bf16: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -2460,6 +2536,7 @@ define <2 x i16> @i16_low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX10-LABEL: i16_low16bits: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2469,6 +2546,7 @@ define <2 x i16> @i16_low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX11-TRUE16-LABEL: i16_low16bits: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -2478,6 +2556,7 @@ define <2 x i16> @i16_low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX11-FAKE16-LABEL: i16_low16bits: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-FAKE16-NEXT: 
global_load_b32 v1, v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -2515,6 +2594,7 @@ define <2 x i16> @i16_low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) ; GFX10-LABEL: i16_low16hi16bits: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2524,6 +2604,7 @@ define <2 x i16> @i16_low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) ; GFX11-LABEL: i16_low16hi16bits: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2550,6 +2631,7 @@ define <2 x i16> @i16_hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) ; GFX10-LABEL: i16_hi16low16bits: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2559,6 +2641,7 @@ define <2 x i16> @i16_hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) ; GFX11-TRUE16-LABEL: i16_hi16low16bits: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -2571,6 +2654,7 @@ define <2 x i16> @i16_hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) ; GFX11-FAKE16-LABEL: i16_hi16low16bits: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off ; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -2608,6 +2692,7 @@ define <2 x i16> @i16_hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX10-LABEL: i16_hi16bits: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2617,6 +2702,7 @@ define <2 x i16> @i16_hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX11-TRUE16-LABEL: i16_hi16bits: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -2630,6 +2716,7 @@ define <2 x i16> @i16_hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX11-FAKE16-LABEL: i16_hi16bits: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -2721,6 +2808,7 @@ define void @shuffle_v8f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1 ; GFX10-LABEL: shuffle_v8f16_concat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2730,6 +2818,7 @@ define void @shuffle_v8f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1 ; GFX11-LABEL: shuffle_v8f16_concat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2758,6 +2847,7 
@@ define void @shuffle_v16f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg ; GFX10-LABEL: shuffle_v16f16_concat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -2769,6 +2859,7 @@ define void @shuffle_v16f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg ; GFX11-LABEL: shuffle_v16f16_concat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(1) @@ -2805,10 +2896,9 @@ define void @shuffle_v32f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg ; GFX10-LABEL: shuffle_v32f16_concat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_clause 0x3 ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16 -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off ; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(3) @@ -2824,10 +2914,9 @@ define void @shuffle_v32f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg ; GFX11-LABEL: shuffle_v32f16_concat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off ; GFX11-NEXT: global_load_b128 v[10:13], v[2:3], off offset:16 -; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(3) @@ -2860,6 +2949,7 @@ define void @shuffle_v8i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1 ; GFX10-LABEL: 
shuffle_v8i16_concat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2869,6 +2959,7 @@ define void @shuffle_v8i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1 ; GFX11-LABEL: shuffle_v8i16_concat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2897,6 +2988,7 @@ define void @shuffle_v16i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg ; GFX10-LABEL: shuffle_v16i16_concat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -2908,6 +3000,7 @@ define void @shuffle_v16i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg ; GFX11-LABEL: shuffle_v16i16_concat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(1) @@ -2944,10 +3037,9 @@ define void @shuffle_v32i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg ; GFX10-LABEL: shuffle_v32i16_concat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_clause 0x3 ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16 -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off ; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(3) @@ -2963,10 +3055,9 @@ define 
void @shuffle_v32i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg ; GFX11-LABEL: shuffle_v32i16_concat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off ; GFX11-NEXT: global_load_b128 v[10:13], v[2:3], off offset:16 -; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(3) @@ -3012,6 +3103,7 @@ define void @shuffle_v4i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ; GFX10-LABEL: shuffle_v4i8_concat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: global_load_short_d16_hi v0, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3030,6 +3122,7 @@ define void @shuffle_v4i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ; GFX11-FAKE16-LABEL: shuffle_v4i8_concat: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off ; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -3056,6 +3149,7 @@ define void @shuffle_v8i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ; GFX10-LABEL: shuffle_v8i8_concat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off ; GFX10-NEXT: global_load_dword v7, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3065,6 +3159,7 @@ define void @shuffle_v8i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ; GFX11-LABEL: shuffle_v8i8_concat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: 
global_load_b32 v1, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3091,6 +3186,7 @@ define void @shuffle_v16i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1 ; GFX10-LABEL: shuffle_v16i8_concat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3100,6 +3196,7 @@ define void @shuffle_v16i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1 ; GFX11-LABEL: shuffle_v16i8_concat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3128,6 +3225,7 @@ define void @shuffle_v32i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1 ; GFX10-LABEL: shuffle_v32i8_concat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -3139,6 +3237,7 @@ define void @shuffle_v32i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1 ; GFX11-LABEL: shuffle_v32i8_concat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(1) @@ -3167,6 +3266,7 @@ define void @shuffle_v4i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1 ; GFX10-LABEL: shuffle_v4i32_concat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3176,6 +3276,7 @@ define void 
@shuffle_v4i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1 ; GFX11-LABEL: shuffle_v4i32_concat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3204,6 +3305,7 @@ define void @shuffle_v8i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1 ; GFX10-LABEL: shuffle_v8i32_concat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -3215,6 +3317,7 @@ define void @shuffle_v8i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1 ; GFX11-LABEL: shuffle_v8i32_concat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(1) @@ -3251,10 +3354,9 @@ define void @shuffle_v16i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg ; GFX10-LABEL: shuffle_v16i32_concat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_clause 0x3 ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16 -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off ; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(3) @@ -3270,10 +3372,9 @@ define void @shuffle_v16i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg ; GFX11-LABEL: shuffle_v16i32_concat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off ; 
GFX11-NEXT: global_load_b128 v[10:13], v[2:3], off offset:16 -; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(3) @@ -3345,6 +3446,7 @@ define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_234u: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -3356,6 +3458,7 @@ define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-LABEL: shuffle_v4bf16_234u: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3485,6 +3588,7 @@ define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_3u6u: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -3496,6 +3600,7 @@ define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-LABEL: shuffle_v4bf16_3u6u: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -3506,6 +3611,7 @@ define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-FAKE16-LABEL: shuffle_v4bf16_3u6u: ; GFX11-FAKE16: ; %bb.0: ; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -3544,6 +3650,7 @@ define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_3uu7: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -3555,6 +3662,7 @@ define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-LABEL: shuffle_v4bf16_3uu7: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -3565,6 +3673,7 @@ define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-FAKE16-LABEL: shuffle_v4bf16_3uu7: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -3603,6 +3712,7 @@ define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_35u5: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3613,6 +3723,7 @@ define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1 ; 
GFX11-TRUE16-LABEL: shuffle_v4bf16_35u5: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -3625,6 +3736,7 @@ define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-FAKE16-LABEL: shuffle_v4bf16_35u5: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -3664,6 +3776,7 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_357u: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -3675,6 +3788,7 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-LABEL: shuffle_v4bf16_357u: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[2:3], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -3688,6 +3802,7 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-FAKE16-LABEL: shuffle_v4bf16_357u: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -3773,6 +3888,7 @@ 
define <4 x bfloat> @shuffle_v4bf16_0145(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_0145: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -3784,6 +3900,7 @@ define <4 x bfloat> @shuffle_v4bf16_0145(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-LABEL: shuffle_v4bf16_0145: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3809,6 +3926,7 @@ define <4 x bfloat> @shuffle_v4bf16_0167(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_0167: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -3820,6 +3938,7 @@ define <4 x bfloat> @shuffle_v4bf16_0167(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-LABEL: shuffle_v4bf16_0167: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3914,6 +4033,7 @@ define <4 x bfloat> @shuffle_v4bf16_2345(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_2345: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -3925,6 +4045,7 @@ define <4 x bfloat> @shuffle_v4bf16_2345(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-LABEL: shuffle_v4bf16_2345: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3950,6 +4071,7 @@ define <4 x bfloat> @shuffle_v4bf16_2367(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_2367: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -3961,6 +4083,7 @@ define <4 x bfloat> @shuffle_v4bf16_2367(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-LABEL: shuffle_v4bf16_2367: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3986,6 +4109,7 @@ define <4 x bfloat> @shuffle_v4bf16_4501(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_4501: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -3997,6 +4121,7 @@ define <4 x bfloat> @shuffle_v4bf16_4501(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-LABEL: shuffle_v4bf16_4501: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off ; GFX11-NEXT: global_load_b32 v1, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(1) @@ -4024,6 +4149,7 @@ define <4 x bfloat> @shuffle_v4bf16_4523(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_4523: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[2:3], 
off ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -4035,6 +4161,7 @@ define <4 x bfloat> @shuffle_v4bf16_4523(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-LABEL: shuffle_v4bf16_4523: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) @@ -4119,6 +4246,7 @@ define <4 x bfloat> @shuffle_v4bf16_6701(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_6701: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -4130,6 +4258,7 @@ define <4 x bfloat> @shuffle_v4bf16_6701(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-LABEL: shuffle_v4bf16_6701: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(1) @@ -4157,6 +4286,7 @@ define <4 x bfloat> @shuffle_v4bf16_6723(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_6723: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -4168,6 +4298,7 @@ define <4 x bfloat> @shuffle_v4bf16_6723(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-LABEL: shuffle_v4bf16_6723: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) @@ -4275,6 
+4406,7 @@ define <4 x bfloat> @shuffle_v4bf16_2356(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_2356: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -4286,6 +4418,7 @@ define <4 x bfloat> @shuffle_v4bf16_2356(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-LABEL: shuffle_v4bf16_2356: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -4297,6 +4430,7 @@ define <4 x bfloat> @shuffle_v4bf16_2356(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-FAKE16-LABEL: shuffle_v4bf16_2356: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -4335,6 +4469,7 @@ define <4 x bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_5623: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -4346,6 +4481,7 @@ define <4 x bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-LABEL: shuffle_v4bf16_5623: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -4357,6 +4493,7 @@ define <4 x bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-FAKE16-LABEL: shuffle_v4bf16_5623: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -4384,6 +4521,7 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_3456: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -4395,6 +4533,7 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-LABEL: shuffle_v4bf16_3456: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v[2:3], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -4408,6 +4547,7 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-FAKE16-LABEL: shuffle_v4bf16_3456: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -4436,6 +4576,7 @@ define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_5634: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; 
GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -4447,6 +4588,7 @@ define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-LABEL: shuffle_v4bf16_5634: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -4460,6 +4602,7 @@ define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-FAKE16-LABEL: shuffle_v4bf16_5634: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -4501,6 +4644,7 @@ define <4 x bfloat> @shuffle_v4bf16_5734(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_5734: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -4512,6 +4656,7 @@ define <4 x bfloat> @shuffle_v4bf16_5734(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-LABEL: shuffle_v4bf16_5734: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -4524,6 +4669,7 @@ define <4 x bfloat> @shuffle_v4bf16_5734(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-FAKE16-LABEL: shuffle_v4bf16_5734: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -4718,6 +4864,7 @@ define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_6161: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4728,6 +4875,7 @@ define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-LABEL: shuffle_v4bf16_6161: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -4741,6 +4889,7 @@ define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-FAKE16-LABEL: shuffle_v4bf16_6161: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -4926,6 +5075,7 @@ define <4 x bfloat> @shuffle_v8bf16_4589(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v8bf16_4589: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:8 ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -4937,6 +5087,7 @@ define <4 x bfloat> @shuffle_v8bf16_4589(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-LABEL: shuffle_v8bf16_4589: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:8 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4962,6 +5113,7 @@ define <4 x bfloat> @shuffle_v8bf16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrsp ; GFX10-LABEL: shuffle_v8bf16_10_11_2_3: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -4973,6 +5125,7 @@ define <4 x bfloat> @shuffle_v8bf16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrsp ; GFX11-LABEL: shuffle_v8bf16_10_11_2_3: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) @@ -5011,6 +5164,7 @@ define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrsp ; GFX10-LABEL: shuffle_v8bf16_13_14_2_3: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -5022,6 +5176,7 @@ define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrsp ; GFX11-TRUE16-LABEL: shuffle_v8bf16_13_14_2_3: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off offset:8 ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -5033,6 +5188,7 @@ define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrsp ; GFX11-FAKE16-LABEL: shuffle_v8bf16_13_14_2_3: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off offset:8 ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) @@ -5170,6 +5326,7 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx3 v[0:2], v[5:6], off ; GFX10-NEXT: global_load_dword v7, v[3:4], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -5182,6 +5339,7 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off ; GFX11-NEXT: global_load_b32 v3, v[3:4], off ; GFX11-NEXT: s_waitcnt vmcnt(1) @@ -5696,6 +5854,7 @@ define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_0456: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 @@ -5708,6 +5867,7 @@ define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-LABEL: shuffle_v4bf16_0456: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -5719,6 +5879,7 @@ define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-FAKE16-LABEL: shuffle_v4bf16_0456: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-FAKE16-NEXT: global_load_b64 v[1:2], v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -5755,6 +5916,7 @@ define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX10-LABEL: low16bits: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5764,6 +5926,7 @@ define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX11-TRUE16-LABEL: low16bits: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -5773,6 +5936,7 @@ define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX11-FAKE16-LABEL: low16bits: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -5810,6 +5974,7 @@ define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) ; GFX10-LABEL: hi16bits_v2bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5819,6 +5984,7 @@ define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) ; GFX11-TRUE16-LABEL: hi16bits_v2bf16: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -5831,6 +5997,7 @@ define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) ; GFX11-FAKE16-LABEL: hi16bits_v2bf16: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -5868,6 +6035,7 @@ define <2 x bfloat> @low16hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) ; GFX10-LABEL: low16hi16bits_v2bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5877,6 +6045,7 @@ define <2 x bfloat> @low16hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) ; GFX11-TRUE16-LABEL: low16hi16bits_v2bf16: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -5886,6 +6055,7 @@ define <2 x bfloat> @low16hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) ; GFX11-FAKE16-LABEL: low16hi16bits_v2bf16: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v2, v[2:3], off ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -5912,6 +6082,7 @@ define <2 x bfloat> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX10-LABEL: hi16low16bits: ; GFX10: ; %bb.0: ; 
%entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5921,6 +6092,7 @@ define <2 x bfloat> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX11-TRUE16-LABEL: hi16low16bits: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) @@ -5932,6 +6104,7 @@ define <2 x bfloat> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX11-FAKE16-LABEL: hi16low16bits: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -5991,6 +6164,7 @@ define void @shuffle_v8bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg ; GFX10-LABEL: shuffle_v8bf16_concat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6000,6 +6174,7 @@ define void @shuffle_v8bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg ; GFX11-LABEL: shuffle_v8bf16_concat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -6028,6 +6203,7 @@ define void @shuffle_v16bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %ar ; GFX10-LABEL: shuffle_v16bf16_concat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -6039,6 +6215,7 @@ define void @shuffle_v16bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %ar ; GFX11-LABEL: shuffle_v16bf16_concat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(1) @@ -6075,10 +6252,9 @@ define void @shuffle_v32bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %ar ; GFX10-LABEL: shuffle_v32bf16_concat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_clause 0x3 ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16 -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off ; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(3) @@ -6094,10 +6270,9 @@ define void @shuffle_v32bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %ar ; GFX11-LABEL: shuffle_v32bf16_concat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off ; GFX11-NEXT: global_load_b128 v[10:13], v[2:3], off offset:16 -; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(3) diff --git a/llvm/test/CodeGen/AMDGPU/vselect.ll b/llvm/test/CodeGen/AMDGPU/vselect.ll index 4ce71e1de039b..78490ed6610a2 100644 --- a/llvm/test/CodeGen/AMDGPU/vselect.ll +++ b/llvm/test/CodeGen/AMDGPU/vselect.ll @@ -94,25 +94,23 @@ define amdgpu_kernel void @test_select_v2f32(ptr addrspace(1) 
%out, ptr addrspac ; ; VI-LABEL: test_select_v2f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_cmp_neq_f32_e32 vcc, s3, v1 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_cmp_neq_f32_e32 vcc, s7, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: test_select_v2f32: @@ -266,33 +264,31 @@ define amdgpu_kernel void @test_select_v4f32(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: test_select_v4f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v3, s11 ; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_cmp_neq_f32_e32 vcc, s7, v3 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_cmp_neq_f32_e32 vcc, s3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v2 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v2 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_cmp_neq_f32_e32 vcc, s1, v1 +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_cmp_neq_f32_e32 vcc, s5, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_cmp_neq_f32_e32 vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: test_select_v4f32: diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll index d10dfcaeba7cc..cf6614f1f3d5e 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll @@ -13,10 +13,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20 ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 -; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 ; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off -; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 ; GFX12-NEXT: 
global_store_b128 v[24:25], v[12:15], off ; GFX12-NEXT: s_endpgm @@ -43,10 +42,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 -; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 ; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off -; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 ; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off ; GFX12-NEXT: s_endpgm @@ -71,6 +69,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16 ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off ; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off ; GFX12-NEXT: s_endpgm @@ -95,6 +94,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16 ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off ; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off ; GFX12-NEXT: s_endpgm @@ -121,10 +121,9 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 -; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_clause 
0x3 ; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 ; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off -; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 ; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off ; GFX12-NEXT: s_endpgm @@ -151,10 +150,9 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 -; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16 ; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off -; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16 ; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off ; GFX12-NEXT: s_endpgm @@ -181,10 +179,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 -; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 ; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off -; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 ; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off ; GFX12-NEXT: s_endpgm @@ -211,10 +208,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 -; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[16:17], 
v[24:27], off offset:16 ; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off -; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 ; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off ; GFX12-NEXT: s_endpgm @@ -241,10 +237,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 -; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 ; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off -; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 ; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off ; GFX12-NEXT: s_endpgm @@ -271,10 +266,9 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 -; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 ; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off -; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 ; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll index 311e76b9bb2b0..f126cadb11247 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll @@ -23,6 +23,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 ; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_3) ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off ; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off ; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off @@ -67,6 +68,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off ; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off ; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off @@ -105,6 +107,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2 ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off ; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off ; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off @@ -143,6 +146,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2 ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off ; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off ; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off @@ -187,6 +191,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> ; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off ; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off ; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off @@ -221,6 +226,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off ; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off ; GFX12-NEXT: s_endpgm @@ -247,6 +253,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7 ; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off ; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off ; GFX12-NEXT: s_endpgm @@ -283,6 +290,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off ; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off ; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off @@ -327,6 +335,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 ; 
GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off ; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off ; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off @@ -371,6 +380,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off ; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off ; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off @@ -415,6 +425,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off ; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off ; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll index b7b6028c86dca..fc0e9354021b4 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll @@ -25,10 +25,9 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_f32_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] ; W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[28:31], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: 
global_store_b128 v[26:27], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[26:27], v[16:19], off ; W32-NEXT: s_endpgm @@ -47,10 +46,9 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23] ; W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[28:31], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[26:27], v[16:19], off ; W32-NEXT: s_endpgm @@ -69,10 +67,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] ; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[28:31], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[26:27], v[16:19], off ; W32-NEXT: s_endpgm @@ -89,10 +86,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] ; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[28:31], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[26:27], v[16:19], off ; W32-NEXT: s_endpgm @@ -111,10 +107,9 @@ define 
amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23] ; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[28:31], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[26:27], v[16:19], off ; W32-NEXT: s_endpgm @@ -131,10 +126,9 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] ; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[28:31], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[26:27], v[16:19], off ; W32-NEXT: s_endpgm @@ -153,10 +147,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[20:23], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off ; W32-NEXT: s_endpgm @@ -173,10 +166,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], 
v[8:15] neg_lo:[0,1,0] ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[0,1,0] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[20:23], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off ; W32-NEXT: s_endpgm @@ -193,10 +185,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,0,0] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[20:23], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off ; W32-NEXT: s_endpgm @@ -213,10 +204,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,1,0] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[20:23], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off ; W32-NEXT: s_endpgm @@ -233,10 +223,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] clamp ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] clamp -; W32-NEXT: s_clause 0x1 +; 
W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[20:23], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off ; W32-NEXT: s_endpgm @@ -253,10 +242,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32 ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] clamp ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[0,1,0] clamp -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[20:23], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off ; W32-NEXT: s_endpgm @@ -273,10 +261,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32 ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] clamp ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,0,0] clamp -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[20:23], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off ; W32-NEXT: s_endpgm @@ -293,10 +280,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] clamp ; W32-NEXT: v_wmma_i32_16x16x16_iu8 v[8:15], v[4:7], v[4:7], v[8:15] neg_lo:[1,1,0] clamp -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[16:17], v[24:27], off 
offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[20:23], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off ; W32-NEXT: s_endpgm @@ -315,10 +301,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[16:19], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off ; W32-NEXT: s_endpgm @@ -335,10 +320,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[0,1,0] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[16:19], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off ; W32-NEXT: s_endpgm @@ -355,10 +339,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,0,0] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[16:19], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; 
W32-NEXT: global_store_b128 v[14:15], v[4:7], off ; W32-NEXT: s_endpgm @@ -375,10 +358,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,1,0] -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[16:19], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off ; W32-NEXT: s_endpgm @@ -396,10 +378,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] clamp ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] clamp -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[16:19], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off ; W32-NEXT: s_endpgm @@ -416,10 +397,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32 ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] clamp ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[0,1,0] clamp -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[16:19], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off ; W32-NEXT: s_endpgm @@ -436,10 +416,9 @@ define amdgpu_ps void 
@test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32 ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] clamp ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,0,0] clamp -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[16:19], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off ; W32-NEXT: s_endpgm @@ -456,10 +435,9 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> ; W32: ; %bb.0: ; %bb ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] clamp ; W32-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v[2:3], v[2:3], v[4:11] neg_lo:[1,1,0] clamp -; W32-NEXT: s_clause 0x1 +; W32-NEXT: s_clause 0x3 ; W32-NEXT: global_store_b128 v[12:13], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[16:19], off -; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off ; W32-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll index 524a25cbc1e6d..08660578d2f51 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll @@ -25,6 +25,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f32_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] ; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off ; W64-NEXT: s_endpgm @@ -43,6 +44,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, 
; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] ; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off ; W64-NEXT: s_endpgm @@ -61,6 +63,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off ; W64-NEXT: s_endpgm @@ -77,6 +80,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off ; W64-NEXT: s_endpgm @@ -95,6 +99,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off ; W64-NEXT: s_endpgm @@ -111,6 +116,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[20:21], v[24:27], 
off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off ; W64-NEXT: s_endpgm @@ -129,6 +135,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off ; W64-NEXT: s_endpgm @@ -146,6 +153,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off ; W64-NEXT: s_endpgm @@ -162,6 +170,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off ; W64-NEXT: s_endpgm @@ -178,6 +187,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off ; W64-NEXT: s_endpgm @@ -194,6 +204,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], 
v[0:3], v[4:7], v[8:11] clamp ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] clamp +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off ; W64-NEXT: s_endpgm @@ -210,6 +221,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32 ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] clamp +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off ; W64-NEXT: s_endpgm @@ -226,6 +238,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32 ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] clamp +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off ; W64-NEXT: s_endpgm @@ -242,6 +255,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] clamp +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off ; W64-NEXT: s_endpgm @@ -260,6 +274,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 
v[10:11], v[4:7], off ; W64-NEXT: s_endpgm @@ -276,6 +291,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off ; W64-NEXT: s_endpgm @@ -292,6 +308,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off ; W64-NEXT: s_endpgm @@ -308,6 +325,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off ; W64-NEXT: s_endpgm @@ -324,6 +342,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] clamp ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] clamp +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off ; W64-NEXT: s_endpgm @@ -340,6 +359,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32 ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp ; 
W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] clamp +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off ; W64-NEXT: s_endpgm @@ -356,6 +376,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32 ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] clamp +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off ; W64-NEXT: s_endpgm @@ -372,6 +393,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] clamp +; W64-NEXT: s_clause 0x1 ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off ; W64-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 1ca2a8ada68ea..8bdce9dc49060 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -265,11 +265,10 @@ define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) { ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_nop 0 -; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-W64-NEXT: v_add_f32_e32 v0, v2, v3 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; 
GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: ; return to shader part epilog @@ -303,11 +302,10 @@ define amdgpu_ps float @test5_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) { ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_nop 0 -; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-W64-NEXT: v_add_f32_e32 v0, v2, v3 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: ; return to shader part epilog @@ -342,11 +340,10 @@ define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) { ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_nop 0 -; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-W64-NEXT: v_add_f32_e32 v0, v2, v3 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: ; return to shader part epilog @@ -382,11 +379,10 @@ define amdgpu_ps float @test6_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) { ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_nop 0 -; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v2, v0, 
s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-W64-NEXT: v_add_f32_e32 v0, v2, v3 ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: ; return to shader part epilog @@ -424,11 +420,10 @@ define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) { ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_nop 0 -; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v4, v2, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-W64-NEXT: v_add_f32_e32 v1, v3, v4 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-W64-NEXT: ; return to shader part epilog @@ -461,11 +456,10 @@ define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) { ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_nop 0 -; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v4, v2, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-W64-NEXT: v_add_u32_e32 v1, v3, v4 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-W64-NEXT: ; return to shader part epilog @@ -875,11 +869,10 @@ define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) { ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 ; 
GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_nop 0 -; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v4, v2, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-W64-NEXT: v_add_f32_e32 v1, v3, v4 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-W64-NEXT: ; return to shader part epilog @@ -914,11 +907,10 @@ define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) { ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_nop 0 -; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v4, v2, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-W64-NEXT: v_add_u32_e32 v1, v3, v4 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-W64-NEXT: ; return to shader part epilog @@ -1307,15 +1299,14 @@ define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) { ; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-W64-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-W64-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_nop 0 -; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen -; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec +; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen ; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; GFX9-W64-NEXT: ; kill: def 
$vgpr3 killed $vgpr3 killed $exec ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-W64-NEXT: v_add_u32_e32 v1, v3, v2 ; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_endpgm ; @@ -2325,11 +2316,10 @@ define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) { ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_nop 0 -; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v4, v2, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-W64-NEXT: v_add_f32_e32 v1, v3, v4 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-W64-NEXT: ; return to shader part epilog @@ -2362,11 +2352,10 @@ define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) { ; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_nop 0 -; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen +; GFX9-W64-NEXT: buffer_load_dword v4, v2, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-W64-NEXT: v_add_u32_e32 v1, v3, v4 ; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-W64-NEXT: ; return to shader part epilog @@ -3270,16 +3259,16 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[16:19], 0 idxen ; GFX9-W64-NEXT: s_mov_b64 
s[0:1], exec +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen ; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt vmcnt(1) ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0 -; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 -; GFX9-W64-NEXT: s_waitcnt vmcnt(1) -; GFX9-W64-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-W64-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[20:21] ; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 @@ -3315,6 +3304,9 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: buffer_load_dword v0, v3, s[16:19], 0 idxen ; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 @@ -3322,9 +3314,6 @@ define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D -; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20 ; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll 
b/llvm/test/CodeGen/AMDGPU/xor.ll index 00bb7b24786f5..76ef62f2b14cb 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -9,20 +9,20 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_xor_b32_e32 v1, v3, v1 -; SI-NEXT: v_xor_b32_e32 v0, v2, v0 +; SI-NEXT: v_xor_b32_e32 v1, v1, v3 +; SI-NEXT: v_xor_b32_e32 v0, v0, v2 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -58,22 +58,22 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: 
s_waitcnt vmcnt(0) -; SI-NEXT: v_xor_b32_e32 v3, v7, v3 -; SI-NEXT: v_xor_b32_e32 v2, v6, v2 -; SI-NEXT: v_xor_b32_e32 v1, v5, v1 -; SI-NEXT: v_xor_b32_e32 v0, v4, v0 +; SI-NEXT: v_xor_b32_e32 v3, v3, v7 +; SI-NEXT: v_xor_b32_e32 v2, v2, v6 +; SI-NEXT: v_xor_b32_e32 v1, v1, v5 +; SI-NEXT: v_xor_b32_e32 v0, v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -111,23 +111,23 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 +; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 -; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 1.0, v1 +; SI-NEXT: s_xor_b64 vcc, vcc, s[0:1] +; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -364,20 +364,20 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: 
s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s14, s6 ; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_xor_b32_e32 v0, v2, v0 -; SI-NEXT: v_xor_b32_e32 v1, v3, v1 +; SI-NEXT: v_xor_b32_e32 v0, v0, v2 +; SI-NEXT: v_xor_b32_e32 v1, v1, v3 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ;